Commit 5cbe7adb authored by Simon Marlow's avatar Simon Marlow

Fix some more shutdown races

There were races between workerTaskStop() and freeTaskManager(): we
need to be sure that all Tasks have exited properly before we start
tearing things down.  This isn't completely straighforward, see
comments for details.
parent 33189c69
......@@ -587,18 +587,20 @@ rts_unlock (Capability *cap)
task = cap->running_task;
ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task);
// slightly delicate ordering of operations below, pay attention!
// We are no longer a bound task/thread. This is important,
// because the GC can run when we release the Capability below,
// and we don't want it to treat this as a live TSO pointer.
task->tso = NULL;
// Now release the Capability. With the capability released, GC
// may happen. NB. does not try to put the current Task on the
// worker queue.
releaseCapability(cap);
// NB. keep cap->lock held while we call boundTaskExiting(). This
// is necessary during shutdown, where we want the invariant that
// after shutdownCapability(), all the Tasks associated with the
// Capability have completed their shutdown too. Otherwise we
// could have boundTaskExiting()/workerTaskStop() running at some
// random point in the future, which causes problems for
// freeTaskManager().
ACQUIRE_LOCK(&cap->lock);
releaseCapability_(cap,rtsFalse);
// Finally, we can release the Task to the free list.
boundTaskExiting(task);
RELEASE_LOCK(&cap->lock);
}
......@@ -2016,9 +2016,22 @@ workerStart(Task *task)
// schedule() runs without a lock.
cap = schedule(cap,task);
// On exit from schedule(), we have a Capability.
releaseCapability(cap);
// On exit from schedule(), we have a Capability, but possibly not
// the same one we started with.
// During shutdown, the requirement is that after all the
// Capabilities are shut down, all workers that are shutting down
// have finished workerTaskStop(). This is why we hold on to
// cap->lock until we've finished workerTaskStop() below.
//
// There may be workers still involved in foreign calls; those
// will just block in waitForReturnCapability() because the
// Capability has been shut down.
//
ACQUIRE_LOCK(&cap->lock);
releaseCapability_(cap,rtsFalse);
workerTaskStop(task);
RELEASE_LOCK(&cap->lock);
}
#endif
......@@ -2121,7 +2134,6 @@ exitScheduler(
shutdownCapability(&capabilities[i], task, wait_foreign);
}
boundTaskExiting(task);
stopTaskManager();
}
#endif
}
......@@ -2129,11 +2141,23 @@ exitScheduler(
void
freeScheduler( void )
{
freeCapabilities();
freeTaskManager();
if (n_capabilities != 1) {
stgFree(capabilities);
nat still_running;
ACQUIRE_LOCK(&sched_mutex);
still_running = freeTaskManager();
// We can only free the Capabilities if there are no Tasks still
// running. We might have a Task about to return from a foreign
// call into waitForReturnCapability(), for example (actually,
// this should be the *only* thing that a still-running Task can
// do at this point, and it will block waiting for the
// Capability).
if (still_running == 0) {
freeCapabilities();
if (n_capabilities != 1) {
stgFree(capabilities);
}
}
RELEASE_LOCK(&sched_mutex);
#if defined(THREADED_RTS)
closeMutex(&sched_mutex);
#endif
......
......@@ -64,25 +64,16 @@ initTaskManager (void)
}
}
void
stopTaskManager (void)
{
debugTrace(DEBUG_sched,
"stopping task manager, %d tasks still running",
tasksRunning);
/* nothing to do */
}
void
nat
freeTaskManager (void)
{
Task *task, *next;
debugTrace(DEBUG_sched, "freeing task manager");
ASSERT_LOCK_HELD(&sched_mutex);
debugTrace(DEBUG_sched, "freeing task manager, %d tasks still running",
tasksRunning);
ACQUIRE_LOCK(&sched_mutex);
for (task = all_tasks; task != NULL; task = next) {
next = task->all_link;
if (task->stopped) {
......@@ -102,7 +93,8 @@ freeTaskManager (void)
#if defined(THREADED_RTS)
freeThreadLocalKey(&currentTaskKey);
#endif
RELEASE_LOCK(&sched_mutex);
return tasksRunning;
}
......@@ -149,7 +141,6 @@ newTask (void)
all_tasks = task;
taskCount++;
workerCount++;
return task;
}
......@@ -185,6 +176,7 @@ newBoundTask (void)
void
boundTaskExiting (Task *task)
{
task->tso = NULL;
task->stopped = rtsTrue;
task->cap = NULL;
......@@ -218,7 +210,11 @@ discardTask (Task *task)
if (!task->stopped) {
debugTrace(DEBUG_sched, "discarding task %ld", (long)TASK_ID(task));
task->cap = NULL;
task->tso = NULL;
if (task->tso == NULL) {
workerCount--;
} else {
task->tso = NULL;
}
task->stopped = rtsTrue;
tasksRunning--;
task->next = task_free_list;
......@@ -263,6 +259,7 @@ workerTaskStop (Task *task)
taskTimeStamp(task);
task->stopped = rtsTrue;
tasksRunning--;
workerCount--;
ACQUIRE_LOCK(&sched_mutex);
task->next = task_free_list;
......
......@@ -169,8 +169,7 @@ extern Task *all_tasks;
// Requires: sched_mutex.
//
void initTaskManager (void);
void stopTaskManager (void);
void freeTaskManager (void);
nat freeTaskManager (void);
// Create a new Task for a bound thread
// Requires: sched_mutex.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment