[CalendarServer-changes] [14741] twext/trunk/twext/enterprise
source_changes at macosforge.org
source_changes at macosforge.org
Mon May 4 11:26:40 PDT 2015
Revision: 14741
http://trac.calendarserver.org//changeset/14741
Author: cdaboo at apple.com
Date: 2015-05-04 11:26:40 -0700 (Mon, 04 May 2015)
Log Message:
-----------
Better error handling for failures during the main job loop.
Modified Paths:
--------------
twext/trunk/twext/enterprise/jobqueue.py
twext/trunk/twext/enterprise/test/test_jobqueue.py
Modified: twext/trunk/twext/enterprise/jobqueue.py
===================================================================
--- twext/trunk/twext/enterprise/jobqueue.py 2015-05-04 16:56:50 UTC (rev 14740)
+++ twext/trunk/twext/enterprise/jobqueue.py 2015-05-04 18:26:40 UTC (rev 14741)
@@ -2102,6 +2102,7 @@
nowTime = datetime.utcfromtimestamp(self.reactor.seconds())
txn = self.transactionFactory(label="jobqueue.workCheck")
+ nextJob = None
try:
nextJob = yield JobItem.nextjob(txn, nowTime, minPriority)
if nextJob is None:
@@ -2137,13 +2138,49 @@
loopCounter += 1
except Exception as e:
- log.error("Failed to pick a new job, {exc}", exc=e)
+ log.error(
+ "Failed to pick a new job: {jobID}, {exc}",
+ jobID=nextJob.jobID if nextJob else "?",
+ exc=e,
+ )
yield txn.abort()
txn = None
- nextJob = None
+
+ # If we can identify the problem job, try and set it to failed so that it
+ # won't block other jobs behind it (it will be picked again when the failure
+ # interval is exceeded - but that has a back off so a permanently stuck item
+ # should fade away. We probably want to have some additional logic to simply
+ # remove something that is permanently failing.
+ if nextJob is not None:
+ txn = self.transactionFactory(label="jobqueue.workCheck.failed")
+ try:
+ failedJob = yield JobItem.load(txn, nextJob.jobID)
+ yield failedJob.failedToRun()
+ except Exception as e:
+ # Could not mark as failed - break out of the next job loop
+ log.error(
+ "Failed to mark failed new job:{}, {exc}",
+ jobID=nextJob.jobID,
+ exc=e,
+ )
+ yield txn.abort()
+ txn = None
+ nextJob = None
+ break
+ else:
+ # Marked the problem one as failed, so keep going and get the next job
+ log.error("Marked failed new job: {jobID}", jobID=nextJob.jobID)
+ yield txn.commit()
+ txn = None
+ nextJob = None
+ else:
+ # Cannot mark anything as failed - break out of next job loop
+ log.error("Cannot mark failed new job")
+ break
finally:
if txn:
yield txn.commit()
+ txn = None
if nextJob is not None:
try:
Modified: twext/trunk/twext/enterprise/test/test_jobqueue.py
===================================================================
--- twext/trunk/twext/enterprise/test/test_jobqueue.py 2015-05-04 16:56:50 UTC (rev 14740)
+++ twext/trunk/twext/enterprise/test/test_jobqueue.py 2015-05-04 18:26:40 UTC (rev 14741)
@@ -1059,7 +1059,7 @@
@inlineCallbacks
def test_temporaryFailure(self):
"""
- When a work item temporARILY fails it should appear as unassigned in the JOB
+ When a work item temporarily fails it should appear as unassigned in the JOB
table and have the failure count bumped, and a notBefore set to the temporary delay.
"""
dbpool, _ignore_qpool, clock, _ignore_performerChosen = self._setupPools()
@@ -1090,7 +1090,160 @@
self.assertTrue(jobs[0].notBefore > datetime.datetime.utcnow() + datetime.timedelta(seconds=90))
+ @inlineCallbacks
+ def test_loopFailure_noRecovery(self):
+ """
+ When L{_workCheck} fails in its loop we need the problem job marked as failed.
+ """
+ dbpool, _ignore_qpool, clock, _ignore_performerChosen = self._setupPools()
+ fakeNow = datetime.datetime(2012, 12, 12, 12, 12, 12)
+ oldNextJob = JobItem.nextjob
+ @inlineCallbacks
+ def _nextJob(cls, txn, now, minPriority):
+ job = yield oldNextJob(txn, now, minPriority)
+ work = yield job.workItem()
+ if work.a == -2:
+ raise ValueError("oops")
+
+ self.patch(JobItem, "nextjob", classmethod(_nextJob))
+
+ # Let's create a couple of work items directly, not via the enqueue
+ # method, so that they exist but nobody will try to immediately execute
+ # them.
+
+ @transactionally(dbpool.pool.connection)
+ @inlineCallbacks
+ def setup(txn):
+ # Failing
+ yield DummyWorkItem.makeJob(
+ txn, a=-2, b=1, notBefore=fakeNow - datetime.timedelta(20 * 60)
+ )
+ # OK
+ yield DummyWorkItem.makeJob(
+ txn, a=1, b=0, notBefore=fakeNow - datetime.timedelta(20 * 60, 5)
+ )
+ yield setup
+ clock.advance(20 - 12)
+
+ @transactionally(dbpool.pool.connection)
+ def check(txn):
+ return JobItem.all(txn)
+
+ jobs = yield check
+ self.assertEqual(len(jobs), 2)
+ self.assertEqual(jobs[0].assigned, None)
+ self.assertEqual(jobs[0].failed, 0)
+ self.assertEqual(jobs[0].notBefore, fakeNow - datetime.timedelta(20 * 60))
+ self.assertEqual(jobs[1].assigned, None)
+ self.assertEqual(jobs[1].failed, 0)
+ self.assertEqual(jobs[1].notBefore, fakeNow - datetime.timedelta(20 * 60, 5))
+
+
+ @inlineCallbacks
+ def test_loopFailure_recovery(self):
+ """
+ When L{_workCheck} fails in its loop we need the problem job marked as failed.
+ """
+ dbpool, _ignore_qpool, clock, _ignore_performerChosen = self._setupPools()
+ fakeNow = datetime.datetime(2012, 12, 12, 12, 12, 12)
+
+ oldAssign = JobItem.assign
+ @inlineCallbacks
+ def _assign(self, when, overdue):
+ work = yield self.workItem()
+ if work.a == -2:
+ raise ValueError("oops")
+ yield oldAssign(self, when, overdue)
+
+ self.patch(JobItem, "assign", _assign)
+
+ # Let's create a couple of work items directly, not via the enqueue
+ # method, so that they exist but nobody will try to immediately execute
+ # them.
+
+ @transactionally(dbpool.pool.connection)
+ @inlineCallbacks
+ def setup(txn):
+ # Failing
+ yield DummyWorkItem.makeJob(
+ txn, a=-2, b=1, notBefore=fakeNow - datetime.timedelta(20 * 60)
+ )
+ # OK
+ yield DummyWorkItem.makeJob(
+ txn, a=1, b=0, notBefore=fakeNow - datetime.timedelta(20 * 60, 5)
+ )
+ yield setup
+ clock.advance(20 - 12)
+
+ @transactionally(dbpool.pool.connection)
+ def check(txn):
+ return JobItem.all(txn)
+
+ jobs = yield check
+ self.assertEqual(len(jobs), 1)
+ self.assertEqual(jobs[0].assigned, None)
+ self.assertEqual(jobs[0].failed, 1)
+ self.assertGreater(jobs[0].notBefore, datetime.datetime.utcnow() + datetime.timedelta(seconds=30))
+
+
+ @inlineCallbacks
+ def test_loopFailure_failedRecovery(self):
+ """
+ When L{_workCheck} fails in its loop we need the problem job marked as failed.
+ """
+ dbpool, _ignore_qpool, clock, _ignore_performerChosen = self._setupPools()
+ fakeNow = datetime.datetime(2012, 12, 12, 12, 12, 12)
+
+ oldAssign = JobItem.assign
+ @inlineCallbacks
+ def _assign(self, when, overdue):
+ work = yield self.workItem()
+ if work.a == -2:
+ raise ValueError("oops")
+ yield oldAssign(self, when, overdue)
+
+ self.patch(JobItem, "assign", _assign)
+
+ @inlineCallbacks
+ def _failedToRun(self, locked=False, delay=None):
+ raise ValueError("oops")
+
+ self.patch(JobItem, "failedToRun", _failedToRun)
+
+ # Let's create a couple of work items directly, not via the enqueue
+ # method, so that they exist but nobody will try to immediately execute
+ # them.
+
+ @transactionally(dbpool.pool.connection)
+ @inlineCallbacks
+ def setup(txn):
+ # Failing
+ yield DummyWorkItem.makeJob(
+ txn, a=-2, b=1, notBefore=fakeNow - datetime.timedelta(20 * 60)
+ )
+ # OK
+ yield DummyWorkItem.makeJob(
+ txn, a=1, b=0, notBefore=fakeNow - datetime.timedelta(20 * 60, 5)
+ )
+ yield setup
+ clock.advance(20 - 12)
+
+ @transactionally(dbpool.pool.connection)
+ def check(txn):
+ return JobItem.all(txn)
+
+ jobs = yield check
+ self.assertEqual(len(jobs), 2)
+ self.assertEqual(jobs[0].assigned, None)
+ self.assertEqual(jobs[0].failed, 0)
+ self.assertEqual(jobs[0].notBefore, fakeNow - datetime.timedelta(20 * 60))
+ self.assertEqual(jobs[1].assigned, None)
+ self.assertEqual(jobs[1].failed, 0)
+ self.assertEqual(jobs[1].notBefore, fakeNow - datetime.timedelta(20 * 60, 5))
+
+
+
class HalfConnection(object):
def __init__(self, protocol):
self.protocol = protocol
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.macosforge.org/pipermail/calendarserver-changes/attachments/20150504/59d0efed/attachment.html>
More information about the calendarserver-changes
mailing list