[CalendarServer-changes] [14665] twext/trunk/twext/enterprise

Wed Apr 8 06:51:33 PDT 2015

Revision: 14665
          http://trac.calendarserver.org//changeset/14665
Author:   cdaboo at apple.com
Date:     2015-04-08 06:51:33 -0700 (Wed, 08 Apr 2015)
Log Message:
-----------
Support temporary failures for job queue items.

Modified Paths:
--------------
    twext/trunk/twext/enterprise/jobqueue.py
    twext/trunk/twext/enterprise/test/test_jobqueue.py

Modified: twext/trunk/twext/enterprise/jobqueue.py
===================================================================

--- twext/trunk/twext/enterprise/jobqueue.py	2015-04-08 13:50:32 UTC (rev 14664)
+++ twext/trunk/twext/enterprise/jobqueue.py	2015-04-08 13:51:33 UTC (rev 14665)
@@ -326,6 +326,21 @@
 
 
 
+class JobTemporaryError(Exception):
+    """
+    A job failed to run due to a temporary failure. The job will be scheduled to run again after the
+    specified interval (a built-in back-off based on the number of failures is also applied).
+    """
+
+    def __init__(self, delay):
+        """
+        @param delay: amount of time in seconds before it should run again
+        @type delay: L{int}
+        """
+        self.delay = delay
+
+
+
 class JobRunningError(Exception):
     """
     A job is already running.
@@ -452,6 +467,28 @@
         def _overtm(nb):
             return "{:.0f}".format(1000 * (t - astimestamp(nb)))
 
+        # Failed job clean-up
+        def _failureCleanUp(delay=None):
+            @inlineCallbacks
+            def _cleanUp2(txn2):
+                try:
+                    job = yield cls.load(txn2, jobID)
+                except NoSuchRecord:
+                    log.debug(
+                        "JobItem: {jobid} disappeared t={tm}",
+                        jobid=jobID,
+                        tm=_tm(),
+                    )
+                else:
+                    log.debug(
+                        "JobItem: {jobid} marking as failed {count} t={tm}",
+                        jobid=jobID,
+                        count=job.failed + 1,
+                        tm=_tm(),
+                    )
+                    yield job.failedToRun(locked=isinstance(e, JobRunningError), delay=delay)
+            return inTransaction(txnFactory, _cleanUp2, "ultimatelyPerform._failureCleanUp")
+
         log.debug("JobItem: {jobid} starting to run", jobid=jobID)
         txn = txnFactory(label="ultimatelyPerform: {}".format(jobID))
         try:
@@ -475,28 +512,24 @@
                 tm=_tm(),
             )
 
+        except JobTemporaryError as e:
+
+            # Temporary failure delay with back-off
+            def _temporaryFailure():
+                return _failureCleanUp(delay=e.delay * (job.failed + 1))
+            log.debug(
+                "JobItem: {jobid} {desc} {work} t={tm}",
+                jobid=jobID,
+                desc="temporary failure #{}".format(job.failed + 1),
+                work=job.workType,
+                tm=_tm(),
+            )
+            txn.postAbort(_temporaryFailure)
+            yield txn.abort()
+
         except (JobFailedError, JobRunningError) as e:
-            # Job failed: abort with cleanup, but pretend this method succeeded
-            def _cleanUp():
-                @inlineCallbacks
-                def _cleanUp2(txn2):
-                    try:
-                        job = yield cls.load(txn2, jobID)
-                    except NoSuchRecord:
-                        log.debug(
-                            "JobItem: {jobid} disappeared t={tm}",
-                            jobid=jobID,
-                            tm=_tm(),
-                        )
-                    else:
-                        log.debug(
-                            "JobItem: {jobid} marking as failed {count} t={tm}",
-                            jobid=jobID,
-                            count=job.failed + 1,
-                            tm=_tm(),
-                        )
-                        yield job.failedToRun(locked=isinstance(e, JobRunningError))
-                return inTransaction(txnFactory, _cleanUp2, "ultimatelyPerform._cleanUp")
+
+            # Permanent failure
             log.debug(
                 "JobItem: {jobid} {desc} {work} t={tm}",
                 jobid=jobID,
@@ -504,7 +537,7 @@
                 work=job.workType,
                 tm=_tm(),
             )
-            txn.postAbort(_cleanUp)
+            txn.postAbort(_failureCleanUp)
             yield txn.abort()
 
         except:
@@ -645,7 +678,10 @@
                     workid=workItem.workID,
                     exc=f,
                 )
-                raise JobFailedError(e)
+                if isinstance(e, JobTemporaryError):
+                    raise
+                else:
+                    raise JobFailedError(e)
 
         try:
             # Once the work is done we delete ourselves - NB this must be the last thing done

Modified: twext/trunk/twext/enterprise/test/test_jobqueue.py
===================================================================
--- twext/trunk/twext/enterprise/test/test_jobqueue.py	2015-04-08 13:50:32 UTC (rev 14664)
+++ twext/trunk/twext/enterprise/test/test_jobqueue.py	2015-04-08 13:51:33 UTC (rev 14665)
@@ -38,14 +38,13 @@
 from twext.enterprise.dal.test.test_parseschema import SchemaTestHelper
 from twext.enterprise.fixtures import buildConnectionPool
 from twext.enterprise.fixtures import SteppablePoolHelper
-from twext.enterprise.jobqueue import (
-    inTransaction, PeerConnectionPool, astimestamp,
-    LocalPerformer, _IJobPerformer, WorkItem, WorkerConnectionPool,
-    ConnectionFromPeerNode,
-    _BaseQueuer, NonPerformingQueuer, JobItem,
-    WORK_PRIORITY_LOW, WORK_PRIORITY_HIGH, WORK_PRIORITY_MEDIUM,
-    JobDescriptor, SingletonWorkItem, JobFailedError
-)
+from twext.enterprise.jobqueue import \
+    inTransaction, PeerConnectionPool, astimestamp, \
+    LocalPerformer, _IJobPerformer, WorkItem, WorkerConnectionPool, \
+    ConnectionFromPeerNode, \
+    _BaseQueuer, NonPerformingQueuer, JobItem, \
+    WORK_PRIORITY_LOW, WORK_PRIORITY_HIGH, WORK_PRIORITY_MEDIUM, \
+    JobDescriptor, SingletonWorkItem, JobFailedError, JobTemporaryError
 import twext.enterprise.jobqueue
 
 # TODO: There should be a store-building utility within twext.enterprise.
@@ -280,6 +279,8 @@
     def doWork(self):
         if self.a == -1:
             raise ValueError("Ooops")
+        elif self.a == -2:
+            raise JobTemporaryError(120)
         self.results[self.jobID] = self.a + self.b
         return succeed(None)
 
@@ -1000,30 +1001,23 @@
         @transactionally(dbpool.pool.connection)
         @inlineCallbacks
         def setup(txn):
-            # First, one that's right now.
+            # OK
             yield DummyWorkItem.makeJob(
                 txn, a=1, b=0, notBefore=fakeNow - datetime.timedelta(20 * 60)
             )
 
-            # Next, create one that's actually far enough into the past to run.
+            # Error
             yield DummyWorkItem.makeJob(
                 txn, a=-1, b=1, notBefore=fakeNow - datetime.timedelta(20 * 60)
             )
 
-            # Finally, one that's actually scheduled for the future.
+            # OK
             yield DummyWorkItem.makeJob(
                 txn, a=2, b=0, notBefore=fakeNow - datetime.timedelta(20 * 60)
             )
         yield setup
         clock.advance(20 - 12)
 
-        # Wait for job
-#        while True:
-#            jobs = yield inTransaction(dbpool.pool.connection, lambda txn: JobItem.all(txn))
-#            if all([job.a == -1 for job in jobs]):
-#                break
-#            clock.advance(1)
-
         # Work item complete
         self.assertTrue(DummyWorkItem.results == {1: 1, 3: 2})
 
@@ -1062,7 +1056,41 @@
         self.assertTrue(jobs[0].notBefore > datetime.datetime.utcnow())
 
 
+    @inlineCallbacks
+    def test_temporaryFailure(self):
+        """
+        When a work item temporarily fails it should appear as unassigned in the JOB
+        table, have its failure count bumped, and have notBefore set to the temporary delay.
+        """
+        dbpool, _ignore_qpool, clock, _ignore_performerChosen = self._setupPools()
+        fakeNow = datetime.datetime(2012, 12, 12, 12, 12, 12)
 
+        # Let's create a work item directly, not via the enqueue
+        # method, so that it exists but nobody will try to immediately execute
+        # it.
+
+        @transactionally(dbpool.pool.connection)
+        @inlineCallbacks
+        def setup(txn):
+            # Create temporarily failing work that's far enough into the past to run.
+            yield DummyWorkItem.makeJob(
+                txn, a=-2, b=1, notBefore=fakeNow - datetime.timedelta(20 * 60)
+            )
+        yield setup
+        clock.advance(20 - 12)
+
+        @transactionally(dbpool.pool.connection)
+        def check(txn):
+            return JobItem.all(txn)
+
+        jobs = yield check
+        self.assertTrue(len(jobs) == 1)
+        self.assertTrue(jobs[0].assigned is None)
+        self.assertTrue(jobs[0].failed == 1)
+        self.assertTrue(jobs[0].notBefore > datetime.datetime.utcnow() + datetime.timedelta(seconds=90))
+
+
+
 class HalfConnection(object):
     def __init__(self, protocol):
         self.protocol = protocol
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.macosforge.org/pipermail/calendarserver-changes/attachments/20150408/646c138a/attachment.html>