From a6ade8378d7dceccd91aff82f7fd3ddf3490e5df Mon Sep 17 00:00:00 2001 From: Ayke van Laethem Date: Tue, 10 Jun 2025 17:10:48 +0200 Subject: [PATCH 1/5] sync: fix TestMutexConcurrent test Accessing the same variable from multiple goroutines is unsafe, and will fail with parallelism. A lightweight way to avoid issues is by using atomic variables. --- src/sync/mutex_test.go | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/sync/mutex_test.go b/src/sync/mutex_test.go index be24d93031..4cb33b9596 100644 --- a/src/sync/mutex_test.go +++ b/src/sync/mutex_test.go @@ -58,9 +58,9 @@ func TestMutexUncontended(t *testing.T) { // It will fail if multiple goroutines hold the lock simultaneously. func TestMutexConcurrent(t *testing.T) { var mu sync.Mutex - var active uint - var completed uint - ok := true + var active atomic.Uint32 + var completed atomic.Uint32 + var fail atomic.Uint32 const n = 10 for i := 0; i < n; i++ { @@ -74,11 +74,11 @@ func TestMutexConcurrent(t *testing.T) { mu.Lock() // Increment the active counter. - active++ + nowActive := active.Add(1) - if active > 1 { + if nowActive > 1 { // Multiple things are holding the lock at the same time. - ok = false + fail.Store(1) } else { // Delay a bit. for k := j; k < n; k++ { @@ -87,10 +87,11 @@ func TestMutexConcurrent(t *testing.T) { } // Decrement the active counter. - active-- + var one = 1 + active.Add(uint32(-one)) // This is completed. - completed++ + completed.Add(1) mu.Unlock() }() @@ -104,10 +105,10 @@ func TestMutexConcurrent(t *testing.T) { // Acquire the lock and check whether everything has completed. mu.Lock() - done = completed == n + done = completed.Load() == n mu.Unlock() } - if !ok { + if fail.Load() != 0 { t.Error("lock held concurrently") } } From 28f70fcddbb3993dba3f7c00a3b5361fae0b1990 Mon Sep 17 00:00:00 2001 From: Ayke van Laethem Date: Tue, 10 Jun 2025 18:44:32 +0200 Subject: [PATCH 2/5] runtime: add exportedFuncPtr This function isn't used yet, but will be used for the "cores" scheduler for RISC-V. --- compiler/compiler.go | 24 +++++++++++++++--------- src/runtime/runtime.go | 5 +++++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/compiler/compiler.go b/compiler/compiler.go index 4870df7803..8305616959 100644 --- a/compiler/compiler.go +++ b/compiler/compiler.go @@ -1856,15 +1856,7 @@ func (b *builder) createBuiltin(argTypes []types.Type, argValues []llvm.Value, c // // This is also where compiler intrinsics are implemented. func (b *builder) createFunctionCall(instr *ssa.CallCommon) (llvm.Value, error) { - var params []llvm.Value - for _, param := range instr.Args { - params = append(params, b.getValue(param, getPos(instr))) - } - - // Try to call the function directly for trivially static calls. - var callee, context llvm.Value - var calleeType llvm.Type - exported := false + // See if this is an intrinsic function that is handled specially. if fn := instr.StaticCallee(); fn != nil { // Direct function call, either to a named or anonymous (directly // applied) function call. If it is anonymous, it may be a closure. 
@@ -1900,13 +1892,27 @@ func (b *builder) createFunctionCall(instr *ssa.CallCommon) (llvm.Value, error) return llvm.ConstInt(b.ctx.Int8Type(), panicStrategy, false), nil case name == "runtime/interrupt.New": return b.createInterruptGlobal(instr) + case name == "runtime.exportedFuncPtr": + _, ptr := b.getFunction(instr.Args[0].(*ssa.Function)) + return b.CreatePtrToInt(ptr, b.uintptrType, ""), nil case name == "internal/abi.FuncPCABI0": retval := b.createDarwinFuncPCABI0Call(instr) if !retval.IsNil() { return retval, nil } } + } + var params []llvm.Value + for _, param := range instr.Args { + params = append(params, b.getValue(param, getPos(instr))) + } + + // Try to call the function directly for trivially static calls. + var callee, context llvm.Value + var calleeType llvm.Type + exported := false + if fn := instr.StaticCallee(); fn != nil { calleeType, callee = b.getFunction(fn) info := b.getFunctionInfo(fn) if callee.IsNil() { diff --git a/src/runtime/runtime.go b/src/runtime/runtime.go index 2dcf313ff7..6bd719f548 100644 --- a/src/runtime/runtime.go +++ b/src/runtime/runtime.go @@ -63,6 +63,11 @@ func strlen(ptr unsafe.Pointer) uintptr //export malloc func malloc(size uintptr) unsafe.Pointer +// Return the address of an exported function. +// This is mainly useful to pass a function pointer without extra context +// parameter to C, for example. +func exportedFuncPtr(fn func()) uintptr + // Compare two same-size buffers for equality. func memequal(x, y unsafe.Pointer, n uintptr) bool { for i := uintptr(0); i < n; i++ { From 3ce840e3915c57901ebaf78b6e7ca7834f3b6fe7 Mon Sep 17 00:00:00 2001 From: Ayke van Laethem Date: Tue, 10 Jun 2025 19:43:04 +0200 Subject: [PATCH 3/5] runtime/interrupt: add Checkpoint type This type can be used to jump back to a previous position in a program from inside an interrupt. This is useful for baremetal systems that implement wfi but not wfe, and therefore have no easy (race-free) way to wait until a flag gets changed inside an interrupt. This is an issue on RISC-V, where this is racy (the interrupt might happen after the check but before the wfi instruction): configureInterrupt() for flag.Load() != 0 { riscv.Asm("wfi") } --- compiler/compiler.go | 2 + compiler/defer.go | 21 +++++++--- compiler/inlineasm.go | 12 ++++++ src/runtime/asm_riscv.S | 9 +++++ src/runtime/interrupt/checkpoint.go | 62 +++++++++++++++++++++++++++++ 5 files changed, 100 insertions(+), 6 deletions(-) create mode 100644 src/runtime/interrupt/checkpoint.go diff --git a/compiler/compiler.go b/compiler/compiler.go index 8305616959..61d7e89335 100644 --- a/compiler/compiler.go +++ b/compiler/compiler.go @@ -1895,6 +1895,8 @@ func (b *builder) createFunctionCall(instr *ssa.CallCommon) (llvm.Value, error) case name == "runtime.exportedFuncPtr": _, ptr := b.getFunction(instr.Args[0].(*ssa.Function)) return b.CreatePtrToInt(ptr, b.uintptrType, ""), nil + case name == "(*runtime/interrupt.Checkpoint).Save": + return b.createInterruptCheckpoint(instr.Args[0]), nil case name == "internal/abi.FuncPCABI0": retval := b.createDarwinFuncPCABI0Call(instr) if !retval.IsNil() { diff --git a/compiler/defer.go b/compiler/defer.go index 2ca76a8325..e2944456df 100644 --- a/compiler/defer.go +++ b/compiler/defer.go @@ -103,10 +103,11 @@ func (b *builder) createLandingPad() { b.CreateBr(b.blockEntries[b.fn.Recover]) } -// createInvokeCheckpoint saves the function state at the given point, to -// continue at the landing pad if a panic happened. This is implemented using a -// setjmp-like construct. 
-func (b *builder) createInvokeCheckpoint() { +// Create a checkpoint (similar to setjmp). This emits inline assembly that +// stores the current program counter inside the ptr address (actually +// ptr+sizeof(ptr)) and then returns a boolean indicating whether this is the +// normal flow (false) or we jumped here from somewhere else (true). +func (b *builder) createCheckpoint(ptr llvm.Value) llvm.Value { // Construct inline assembly equivalents of setjmp. // The assembly works as follows: // * All registers (both callee-saved and caller saved) are clobbered @@ -217,11 +218,19 @@ li a0, 0 // This case should have been handled by b.supportsRecover(). b.addError(b.fn.Pos(), "unknown architecture for defer: "+b.archFamily()) } - asmType := llvm.FunctionType(resultType, []llvm.Type{b.deferFrame.Type()}, false) + asmType := llvm.FunctionType(resultType, []llvm.Type{b.dataPtrType}, false) asm := llvm.InlineAsm(asmType, asmString, constraints, false, false, 0, false) - result := b.CreateCall(asmType, asm, []llvm.Value{b.deferFrame}, "setjmp") + result := b.CreateCall(asmType, asm, []llvm.Value{ptr}, "setjmp") result.AddCallSiteAttribute(-1, b.ctx.CreateEnumAttribute(llvm.AttributeKindID("returns_twice"), 0)) isZero := b.CreateICmp(llvm.IntEQ, result, llvm.ConstInt(resultType, 0, false), "setjmp.result") + return isZero +} + +// createInvokeCheckpoint saves the function state at the given point, to +// continue at the landing pad if a panic happened. This is implemented using a +// setjmp-like construct. +func (b *builder) createInvokeCheckpoint() { + isZero := b.createCheckpoint(b.deferFrame) continueBB := b.insertBasicBlock("") b.CreateCondBr(isZero, continueBB, b.landingpad) b.SetInsertPointAtEnd(continueBB) diff --git a/compiler/inlineasm.go b/compiler/inlineasm.go index 5e54b3be60..237c430abe 100644 --- a/compiler/inlineasm.go +++ b/compiler/inlineasm.go @@ -249,3 +249,15 @@ func (b *builder) emitCSROperation(call *ssa.CallCommon) (llvm.Value, error) { return llvm.Value{}, b.makeError(call.Pos(), "unknown CSR operation: "+name) } } + +// Implement runtime/interrupt.Checkpoint.Save. It needs to be implemented +// directly at the call site. If it isn't implemented directly at the call site +// (but instead through a function call), it might result in an overwritten +// stack in the non-jump return case. +func (b *builder) createInterruptCheckpoint(ptr ssa.Value) llvm.Value { + addr := b.getValue(ptr, ptr.Pos()) + b.createNilCheck(ptr, addr, "deref") + stackPointer := b.readStackPointer() + b.CreateStore(stackPointer, addr) + return b.createCheckpoint(addr) +} diff --git a/src/runtime/asm_riscv.S b/src/runtime/asm_riscv.S index c028d6d53a..9cf6571850 100644 --- a/src/runtime/asm_riscv.S +++ b/src/runtime/asm_riscv.S @@ -50,3 +50,12 @@ tinygo_longjmp: lw sp, 0(a0) // jumpSP lw a1, REGSIZE(a0) // jumpPC jr a1 + +.section .text.tinygo_checkpointJump +.global tinygo_checkpointJump +tinygo_checkpointJump: + // Note: the code we jump to assumes a0 is non-zero, which is already the + // case because that's the stack pointer. + mv sp, a0 // jumpSP + csrw mepc, a1 // update MEPC value, so we resume there after the mret + mret // jump to jumpPC diff --git a/src/runtime/interrupt/checkpoint.go b/src/runtime/interrupt/checkpoint.go new file mode 100644 index 0000000000..f02a2bead1 --- /dev/null +++ b/src/runtime/interrupt/checkpoint.go @@ -0,0 +1,62 @@ +package interrupt + +// A checkpoint is a setjmp like buffer, that can be used as a flag for +// interrupts. 
+// +// It can be used as follows: +// +// // global var +// var c Checkpoint +// +// // to set up the checkpoint and wait for it +// if c.Save() { +// setupInterrupt() +// for { +// waitForInterrupt() +// } +// } +// +// // Inside the interrupt handler: +// if c.Saved() { +// c.Jump() +// } +type Checkpoint struct { + jumpSP uintptr + jumpPC uintptr +} + +// Save the execution state in the given checkpoint, overwriting a previous +// saved checkpoint. +// +// This function returns twice: once the normal way after saving (returning +// true) and once after jumping (returning false). +// +// This function is a compiler intrinsic, it is not implemented in Go. +func (c *Checkpoint) Save() bool + +// Returns whether a jump point was saved (and not erased due to a jump). +func (c *Checkpoint) Saved() bool { + return c.jumpPC != 0 +} + +// Jump to the point where the execution state was saved, and erase the saved +// jump point. This must *only* be called from inside an interrupt. +// +// This method does not return in the conventional way, it resumes execution at +// the last point a checkpoint was saved. +func (c *Checkpoint) Jump() { + if !c.Saved() { + panic("runtime/interrupt: no checkpoint was saved") + } + jumpPC := c.jumpPC + jumpSP := c.jumpSP + c.jumpPC = 0 + c.jumpSP = 0 + if jumpPC == 0 { + panic("jumping to 0") + } + checkpointJump(jumpSP, jumpPC) +} + +//export tinygo_checkpointJump +func checkpointJump(jumpSP, jumpPC uintptr) From befd1fb40201c59c96c69c1b0c8d6597c2678e93 Mon Sep 17 00:00:00 2001 From: Ayke van Laethem Date: Wed, 11 Jun 2025 10:26:52 +0200 Subject: [PATCH 4/5] runtime: refactor obtaining the system stack The system stack is only needed when we're not on it. So we can directly call task.SystemStack() without problems. This also saves a tiny bit of binary size. --- builder/sizes_test.go | 6 +++--- src/internal/task/task_none.go | 6 ++++++ src/runtime/gc_stack_raw.go | 2 +- src/runtime/scheduler_none.go | 6 ------ src/runtime/scheduler_tasks.go | 17 ----------------- 5 files changed, 10 insertions(+), 27 deletions(-) delete mode 100644 src/runtime/scheduler_tasks.go diff --git a/builder/sizes_test.go b/builder/sizes_test.go index 931e9970e0..6009810c0c 100644 --- a/builder/sizes_test.go +++ b/builder/sizes_test.go @@ -42,9 +42,9 @@ func TestBinarySize(t *testing.T) { // This is a small number of very diverse targets that we want to test. tests := []sizeTest{ // microcontrollers - {"hifive1b", "examples/echo", 4560, 280, 0, 2268}, - {"microbit", "examples/serial", 2924, 388, 8, 2272}, - {"wioterminal", "examples/pininterrupt", 7383, 1489, 116, 6912}, + {"hifive1b", "examples/echo", 4556, 280, 0, 2268}, + {"microbit", "examples/serial", 2920, 388, 8, 2272}, + {"wioterminal", "examples/pininterrupt", 7379, 1489, 116, 6912}, // TODO: also check wasm. Right now this is difficult, because // wasm binaries are run through wasm-opt and therefore the diff --git a/src/internal/task/task_none.go b/src/internal/task/task_none.go index 280f1c4a81..60bd867aeb 100644 --- a/src/internal/task/task_none.go +++ b/src/internal/task/task_none.go @@ -36,3 +36,9 @@ func OnSystemStack() bool { // This scheduler does not do any stack switching. return true } + +func SystemStack() uintptr { + // System stack is the current stack, so this shouldn't be called. 
+ runtimePanic("scheduler is disabled") + return 0 // unreachable +} diff --git a/src/runtime/gc_stack_raw.go b/src/runtime/gc_stack_raw.go index 94cb5e43b2..5c302b1f11 100644 --- a/src/runtime/gc_stack_raw.go +++ b/src/runtime/gc_stack_raw.go @@ -20,7 +20,7 @@ func markStack() { if !task.OnSystemStack() { // Mark system stack. - markRoots(getSystemStackPointer(), stackTop) + markRoots(task.SystemStack(), stackTop) } } diff --git a/src/runtime/scheduler_none.go b/src/runtime/scheduler_none.go index 3f88e03ebf..1539739714 100644 --- a/src/runtime/scheduler_none.go +++ b/src/runtime/scheduler_none.go @@ -73,9 +73,3 @@ func scheduler(returnAtDeadlock bool) { // this code should be unreachable. runtimePanic("unreachable: scheduler must not be called with the 'none' scheduler") } - -// getSystemStackPointer returns the current stack pointer of the system stack. -// This is always the current stack pointer. -func getSystemStackPointer() uintptr { - return getCurrentStackPointer() -} diff --git a/src/runtime/scheduler_tasks.go b/src/runtime/scheduler_tasks.go deleted file mode 100644 index 6ee540fd35..0000000000 --- a/src/runtime/scheduler_tasks.go +++ /dev/null @@ -1,17 +0,0 @@ -//go:build scheduler.tasks - -package runtime - -import "internal/task" - -// getSystemStackPointer returns the current stack pointer of the system stack. -// This is not necessarily the same as the current stack pointer. -func getSystemStackPointer() uintptr { - // TODO: this always returns the correct stack on Cortex-M, so don't bother - // comparing against 0. - sp := task.SystemStack() - if sp == 0 { - sp = getCurrentStackPointer() - } - return sp -} From f21b67c089dd80120729d9bda452ee035a58b109 Mon Sep 17 00:00:00 2001 From: Ayke van Laethem Date: Thu, 3 Apr 2025 13:29:55 +0200 Subject: [PATCH 5/5] all: add support for multicore scheduler This commit adds support for a scheduler that runs a scheduler on all available cores. It is meant to be used on baremetal systems with a fixed number of cores, such as the RP2040. The initial implementation adds support for multicore scheduling to the riscv-qemu target as a convenient testing target. This means that this new multicore scheduler is tested in CI, including a bunch of standard library tests (`make tinygo-test-baremetal`). This should ensure the new scheduler is reasonably well tested before trying to use it on harder-to-debug targets like the RP2040. 
--- builder/sizes.go | 2 +- builder/sizes_test.go | 2 +- compileopts/config.go | 5 + compileopts/options.go | 2 +- compileopts/options_test.go | 2 +- src/device/riscv/start.S | 39 ++ src/internal/task/atomic-cooperative.go | 2 +- src/internal/task/atomic-preemptive.go | 2 +- src/internal/task/futex-cooperative.go | 2 +- src/internal/task/futex-cores.go | 64 +++ .../{futex-preemptive.go => futex-threads.go} | 0 src/internal/task/mutex-cooperative.go | 2 +- src/internal/task/mutex-preemptive.go | 2 +- src/internal/task/pmutex-cooperative.go | 2 +- src/internal/task/pmutex-preemptive.go | 2 +- src/internal/task/queue.go | 47 +- src/internal/task/task.go | 17 + src/internal/task/task_stack.go | 35 +- src/internal/task/task_stack_multicore.go | 53 +++ src/internal/task/task_stack_tinygoriscv.go | 19 +- src/internal/task/task_stack_unicore.go | 37 ++ src/runtime/atomics_critical.go | 61 ++- src/runtime/gc_stack_cores.go | 101 +++++ src/runtime/gc_stack_raw.go | 10 +- src/runtime/panic.go | 3 +- src/runtime/print.go | 13 - src/runtime/runtime_tinygoriscv_qemu.go | 407 ++++++++++++++++-- src/runtime/scheduler_cooperative.go | 16 + src/runtime/scheduler_cores.go | 317 ++++++++++++++ src/runtime/scheduler_none.go | 21 +- src/runtime/scheduler_tasks.go | 13 + src/runtime/scheduler_threads.go | 32 +- targets/riscv-qemu.json | 21 +- targets/riscv.ld | 23 +- .../gen-critical-atomics.go | 21 +- 35 files changed, 1237 insertions(+), 160 deletions(-) create mode 100644 src/internal/task/futex-cores.go rename src/internal/task/{futex-preemptive.go => futex-threads.go} (100%) create mode 100644 src/internal/task/task_stack_multicore.go create mode 100644 src/internal/task/task_stack_unicore.go create mode 100644 src/runtime/gc_stack_cores.go create mode 100644 src/runtime/scheduler_cores.go create mode 100644 src/runtime/scheduler_tasks.go diff --git a/builder/sizes.go b/builder/sizes.go index 485a652d97..57fb36df67 100644 --- a/builder/sizes.go +++ b/builder/sizes.go @@ -490,7 +490,7 @@ func loadProgramSize(path string, packagePathMap map[string]string) (*programSiz continue } if section.Type == elf.SHT_NOBITS { - if section.Name == ".stack" { + if strings.HasPrefix(section.Name, ".stack") { // TinyGo emits stack sections on microcontroller using the // ".stack" name. // This is a bit ugly, but I don't think there is a way to diff --git a/builder/sizes_test.go b/builder/sizes_test.go index 6009810c0c..8184040648 100644 --- a/builder/sizes_test.go +++ b/builder/sizes_test.go @@ -42,7 +42,7 @@ func TestBinarySize(t *testing.T) { // This is a small number of very diverse targets that we want to test. tests := []sizeTest{ // microcontrollers - {"hifive1b", "examples/echo", 4556, 280, 0, 2268}, + {"hifive1b", "examples/echo", 4556, 280, 0, 2264}, {"microbit", "examples/serial", 2920, 388, 8, 2272}, {"wioterminal", "examples/pininterrupt", 7379, 1489, 116, 6912}, diff --git a/compileopts/config.go b/compileopts/config.go index d05111f2b0..e1fb27f66e 100644 --- a/compileopts/config.go +++ b/compileopts/config.go @@ -110,6 +110,11 @@ func (c *Config) BuildTags() []string { "math_big_pure_go", // to get math/big to work "gc." + c.GC(), "scheduler." + c.Scheduler(), // used inside the runtime package "serial." + c.Serial()}...) 
// used inside the machine package + switch c.Scheduler() { + case "threads", "cores": + default: + tags = append(tags, "tinygo.unicore") + } for i := 1; i <= c.GoMinorVersion; i++ { tags = append(tags, fmt.Sprintf("go1.%d", i)) } diff --git a/compileopts/options.go b/compileopts/options.go index ddad0b8795..517664db2c 100644 --- a/compileopts/options.go +++ b/compileopts/options.go @@ -10,7 +10,7 @@ import ( var ( validBuildModeOptions = []string{"default", "c-shared", "wasi-legacy"} validGCOptions = []string{"none", "leaking", "conservative", "custom", "precise", "boehm"} - validSchedulerOptions = []string{"none", "tasks", "asyncify", "threads"} + validSchedulerOptions = []string{"none", "tasks", "asyncify", "threads", "cores"} validSerialOptions = []string{"none", "uart", "usb", "rtt"} validPrintSizeOptions = []string{"none", "short", "full", "html"} validPanicStrategyOptions = []string{"print", "trap"} diff --git a/compileopts/options_test.go b/compileopts/options_test.go index e75c10d767..dd098e6c4a 100644 --- a/compileopts/options_test.go +++ b/compileopts/options_test.go @@ -10,7 +10,7 @@ import ( func TestVerifyOptions(t *testing.T) { expectedGCError := errors.New(`invalid gc option 'incorrect': valid values are none, leaking, conservative, custom, precise, boehm`) - expectedSchedulerError := errors.New(`invalid scheduler option 'incorrect': valid values are none, tasks, asyncify, threads`) + expectedSchedulerError := errors.New(`invalid scheduler option 'incorrect': valid values are none, tasks, asyncify, threads, cores`) expectedPrintSizeError := errors.New(`invalid size option 'incorrect': valid values are none, short, full, html`) expectedPanicStrategyError := errors.New(`invalid panic option 'incorrect': valid values are print, trap`) diff --git a/src/device/riscv/start.S b/src/device/riscv/start.S index 25217b3579..d67d82dc5a 100644 --- a/src/device/riscv/start.S +++ b/src/device/riscv/start.S @@ -3,8 +3,47 @@ .type _start,@function _start: + // If we're on a multicore system, we need to wait for hart 0 to wake us up. +#if TINYGO_CORES > 1 + csrr a0, mhartid + + // Hart 0 stack + bnez a0, 1f + la sp, _stack_top + +1: + // Hart 1 stack + li a1, 1 + bne a0, a1, 2f + la sp, _stack1_top + +2: + // Hart 2 stack + #if TINYGO_CORES >= 3 + li a1, 2 + bne a0, a1, 3f + la sp, _stack2_top + #endif + +3: + // Hart 3 stack + #if TINYGO_CORES >= 4 + li a1, 3 + bne a0, a1, 4f + la sp, _stack3_top + #endif + +4: + // done + +#if TINYGO_CORES > 4 +#error only up to 4 cores are supported at the moment! +#endif + +#else // Load the stack pointer. la sp, _stack_top +#endif // Load the globals pointer. The program will load pointers relative to this // register, so it must be set to the right value on startup. 
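The startup assembly above only selects a per-hart stack; harts other than 0
must then wait until hart 0 has finished initialization. A condensed sketch of
that hand-off as implemented later in this patch (main() and
startSecondaryCores() in src/runtime/runtime_tinygoriscv_qemu.go), not
standalone code:

    // Hart 0, after the package initializers have run:
    for hart := 1; hart < numCPU; hart++ {
        aclintMSWI.MSIP[hart].Set(1) // software interrupt: this hart may start
    }

    // Harts 1..numCPU-1, right after the stack selection shown above:
    for riscv.MIP.Get()&riscv.MIP_MSIP == 0 {
        riscv.Asm("wfi") // interrupts are still masked, so just wait for the bit
    }
    aclintMSWI.MSIP[hartID].Set(0) // clear the interrupt, then start the scheduler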
diff --git a/src/internal/task/atomic-cooperative.go b/src/internal/task/atomic-cooperative.go index bd4cba8956..e05ea7de0d 100644 --- a/src/internal/task/atomic-cooperative.go +++ b/src/internal/task/atomic-cooperative.go @@ -1,4 +1,4 @@ -//go:build !scheduler.threads +//go:build tinygo.unicore package task diff --git a/src/internal/task/atomic-preemptive.go b/src/internal/task/atomic-preemptive.go index 275f36dce4..b395ef48a3 100644 --- a/src/internal/task/atomic-preemptive.go +++ b/src/internal/task/atomic-preemptive.go @@ -1,4 +1,4 @@ -//go:build scheduler.threads +//go:build !tinygo.unicore package task diff --git a/src/internal/task/futex-cooperative.go b/src/internal/task/futex-cooperative.go index 2a42c28d43..ae9efb5a73 100644 --- a/src/internal/task/futex-cooperative.go +++ b/src/internal/task/futex-cooperative.go @@ -1,4 +1,4 @@ -//go:build !scheduler.threads +//go:build tinygo.unicore package task diff --git a/src/internal/task/futex-cores.go b/src/internal/task/futex-cores.go new file mode 100644 index 0000000000..9bf493f25c --- /dev/null +++ b/src/internal/task/futex-cores.go @@ -0,0 +1,64 @@ +//go:build scheduler.cores + +package task + +import "runtime/interrupt" + +// A futex is a way for userspace to wait with the pointer as the key, and for +// another thread to wake one or all waiting threads keyed on the same pointer. +// +// A futex does not change the underlying value, it only reads it before to prevent +// lost wake-ups. +type Futex struct { + Uint32 + + waiters Stack +} + +// Atomically check for cmp to still be equal to the futex value and if so, go +// to sleep. Return true if we were definitely awoken by a call to Wake or +// WakeAll, and false if we can't be sure of that. +func (f *Futex) Wait(cmp uint32) (awoken bool) { + mask := lockFutex() + + if f.Uint32.Load() != cmp { + unlockFutex(mask) + return false + } + + // Push the current goroutine onto the waiter stack. + f.waiters.Push(Current()) + + unlockFutex(mask) + + // Pause until this task is awoken by Wake/WakeAll. + Pause() + + // We were awoken by a call to Wake or WakeAll. There is no chance for + // spurious wakeups. + return true +} + +// Wake a single waiter. +func (f *Futex) Wake() { + mask := lockFutex() + if t := f.waiters.Pop(); t != nil { + scheduleTask(t) + } + unlockFutex(mask) +} + +// Wake all waiters. 
+func (f *Futex) WakeAll() { + mask := lockFutex() + for t := f.waiters.Pop(); t != nil; t = f.waiters.Pop() { + scheduleTask(t) + } + unlockFutex(mask) +} + +//go:linkname lockFutex runtime.lockFutex +func lockFutex() interrupt.State + +//go:linkname unlockFutex runtime.unlockFutex +func unlockFutex(interrupt.State) diff --git a/src/internal/task/futex-preemptive.go b/src/internal/task/futex-threads.go similarity index 100% rename from src/internal/task/futex-preemptive.go rename to src/internal/task/futex-threads.go diff --git a/src/internal/task/mutex-cooperative.go b/src/internal/task/mutex-cooperative.go index f1205eea25..90274df2bb 100644 --- a/src/internal/task/mutex-cooperative.go +++ b/src/internal/task/mutex-cooperative.go @@ -1,4 +1,4 @@ -//go:build !scheduler.threads +//go:build tinygo.unicore package task diff --git a/src/internal/task/mutex-preemptive.go b/src/internal/task/mutex-preemptive.go index 27f4646698..ec83a6135d 100644 --- a/src/internal/task/mutex-preemptive.go +++ b/src/internal/task/mutex-preemptive.go @@ -1,4 +1,4 @@ -//go:build scheduler.threads +//go:build !tinygo.unicore package task diff --git a/src/internal/task/pmutex-cooperative.go b/src/internal/task/pmutex-cooperative.go index 0e6c4f828b..b61e92d829 100644 --- a/src/internal/task/pmutex-cooperative.go +++ b/src/internal/task/pmutex-cooperative.go @@ -1,4 +1,4 @@ -//go:build !scheduler.threads +//go:build tinygo.unicore package task diff --git a/src/internal/task/pmutex-preemptive.go b/src/internal/task/pmutex-preemptive.go index 10f0a63561..92263ed256 100644 --- a/src/internal/task/pmutex-preemptive.go +++ b/src/internal/task/pmutex-preemptive.go @@ -1,4 +1,4 @@ -//go:build scheduler.threads +//go:build !tinygo.unicore package task diff --git a/src/internal/task/queue.go b/src/internal/task/queue.go index 5a00d95a6f..7242c4e31d 100644 --- a/src/internal/task/queue.go +++ b/src/internal/task/queue.go @@ -12,9 +12,9 @@ type Queue struct { // Push a task onto the queue. func (q *Queue) Push(t *Task) { - i := interrupt.Disable() + mask := lockAtomics() if asserts && t.Next != nil { - interrupt.Restore(i) + unlockAtomics(mask) panic("runtime: pushing a task to a queue with a non-nil Next pointer") } if q.tail != nil { @@ -25,15 +25,15 @@ func (q *Queue) Push(t *Task) { if q.head == nil { q.head = t } - interrupt.Restore(i) + unlockAtomics(mask) } // Pop a task off of the queue. func (q *Queue) Pop() *Task { - i := interrupt.Disable() + mask := lockAtomics() t := q.head if t == nil { - interrupt.Restore(i) + unlockAtomics(mask) return nil } q.head = t.Next @@ -41,13 +41,13 @@ func (q *Queue) Pop() *Task { q.tail = nil } t.Next = nil - interrupt.Restore(i) + unlockAtomics(mask) return t } // Append pops the contents of another queue and pushes them onto the end of this queue. func (q *Queue) Append(other *Queue) { - i := interrupt.Disable() + mask := lockAtomics() if q.head == nil { q.head = other.head } else { @@ -55,14 +55,14 @@ func (q *Queue) Append(other *Queue) { } q.tail = other.tail other.head, other.tail = nil, nil - interrupt.Restore(i) + unlockAtomics(mask) } // Empty checks if the queue is empty. func (q *Queue) Empty() bool { - i := interrupt.Disable() + mask := lockAtomics() empty := q.head == nil - interrupt.Restore(i) + unlockAtomics(mask) return empty } @@ -75,24 +75,24 @@ type Stack struct { // Push a task onto the stack. 
func (s *Stack) Push(t *Task) { - i := interrupt.Disable() + mask := lockAtomics() if asserts && t.Next != nil { - interrupt.Restore(i) + unlockAtomics(mask) panic("runtime: pushing a task to a stack with a non-nil Next pointer") } s.top, t.Next = t, s.top - interrupt.Restore(i) + unlockAtomics(mask) } // Pop a task off of the stack. func (s *Stack) Pop() *Task { - i := interrupt.Disable() + mask := lockAtomics() t := s.top if t != nil { s.top = t.Next t.Next = nil } - interrupt.Restore(i) + unlockAtomics(mask) return t } @@ -112,13 +112,26 @@ func (t *Task) tail() *Task { // Queue moves the contents of the stack into a queue. // Elements can be popped from the queue in the same order that they would be popped from the stack. func (s *Stack) Queue() Queue { - i := interrupt.Disable() + mask := lockAtomics() head := s.top s.top = nil q := Queue{ head: head, tail: head.tail(), } - interrupt.Restore(i) + unlockAtomics(mask) return q } + +// Use runtime.lockAtomics and runtime.unlockAtomics so that Queue and Stack +// work correctly even on multicore systems. These functions are normally used +// to implement atomic operations, but the same spinlock can also be used for +// Queue/Stack operations which are very fast. +// These functions are just plain old interrupt disable/restore on non-multicore +// systems. + +//go:linkname lockAtomics runtime.lockAtomics +func lockAtomics() interrupt.State + +//go:linkname unlockAtomics runtime.unlockAtomics +func unlockAtomics(mask interrupt.State) diff --git a/src/internal/task/task.go b/src/internal/task/task.go index 58c02fe846..e257e1bc8e 100644 --- a/src/internal/task/task.go +++ b/src/internal/task/task.go @@ -24,11 +24,28 @@ type Task struct { // This is needed for some crypto packages. FipsIndicator uint8 + // State of the goroutine: running, paused, or must-resume-next-pause. + // This extra field doesn't increase memory usage on 32-bit CPUs and above, + // since it falls into the padding of the FipsIndicator bit above. + RunState uint8 + // DeferFrame stores a pointer to the (stack allocated) defer frame of the // goroutine that is used for the recover builtin. DeferFrame unsafe.Pointer } +const ( + // Initial state: the goroutine state is saved on the stack. + RunStatePaused = iota + + // The goroutine is running right now. + RunStateRunning + + // The goroutine is running, but already marked as "can resume". + // The next call to Pause() won't actually pause the goroutine. + RunStateResuming +) + // DataUint32 returns the Data field as a uint32. The value is only valid after // setting it through SetDataUint32 or by storing to it using DataAtomicUint32. func (t *Task) DataUint32() uint32 { diff --git a/src/internal/task/task_stack.go b/src/internal/task/task_stack.go index 74a0a8c7cc..b6c4a5df93 100644 --- a/src/internal/task/task_stack.go +++ b/src/internal/task/task_stack.go @@ -1,9 +1,8 @@ -//go:build scheduler.tasks +//go:build scheduler.tasks || scheduler.cores package task import ( - "runtime/interrupt" "unsafe" ) @@ -32,44 +31,12 @@ type state struct { canaryPtr *uintptr } -// currentTask is the current running task, or nil if currently in the scheduler. -var currentTask *Task - -// Current returns the current active task. -func Current() *Task { - return currentTask -} - -// Pause suspends the current task and returns to the scheduler. -// This function may only be called when running on a goroutine stack, not when running on the system stack or in an interrupt. 
-func Pause() { - // Check whether the canary (the lowest address of the stack) is still - // valid. If it is not, a stack overflow has occurred. - if *currentTask.state.canaryPtr != stackCanary { - runtimePanic("goroutine stack overflow") - } - if interrupt.In() { - runtimePanic("blocked inside interrupt") - } - currentTask.state.pause() -} - //export tinygo_task_exit func taskExit() { // TODO: explicitly free the stack after switching back to the scheduler. Pause() } -// Resume the task until it pauses or completes. -// This may only be called from the scheduler. -func (t *Task) Resume() { - currentTask = t - t.gcData.swap() - t.state.resume() - t.gcData.swap() - currentTask = nil -} - // initialize the state and prepare to call the specified function with the specified argument bundle. func (s *state) initialize(fn uintptr, args unsafe.Pointer, stackSize uintptr) { // Create a stack. diff --git a/src/internal/task/task_stack_multicore.go b/src/internal/task/task_stack_multicore.go new file mode 100644 index 0000000000..65cf3a004c --- /dev/null +++ b/src/internal/task/task_stack_multicore.go @@ -0,0 +1,53 @@ +//go:build scheduler.cores + +package task + +import "runtime/interrupt" + +// Current returns the current active task. +// +//go:linkname Current runtime.currentTask +func Current() *Task + +// Pause suspends the current task and returns to the scheduler. +// This function may only be called when running on a goroutine stack, not when running on the system stack or in an interrupt. +func Pause() { + lockScheduler() + PauseLocked() +} + +// PauseLocked is the same as Pause, but must be called with the scheduler lock +// already taken. +func PauseLocked() { + // Check whether the canary (the lowest address of the stack) is still + // valid. If it is not, a stack overflow has occurred. + current := Current() + if *current.state.canaryPtr != stackCanary { + runtimePanic("goroutine stack overflow") + } + if interrupt.In() { + runtimePanic("blocked inside interrupt") + } + if current.RunState == RunStateResuming { + // Another core already marked this goroutine as ready to resume. + current.RunState = RunStateRunning + unlockScheduler() + return + } + current.RunState = RunStatePaused + current.state.pause() +} + +// Resume the task until it pauses or completes. +// This may only be called from the scheduler. +func (t *Task) Resume() { + t.gcData.swap() + t.state.resume() + t.gcData.swap() +} + +//go:linkname lockScheduler runtime.lockScheduler +func lockScheduler() + +//go:linkname unlockScheduler runtime.unlockScheduler +func unlockScheduler() diff --git a/src/internal/task/task_stack_tinygoriscv.go b/src/internal/task/task_stack_tinygoriscv.go index edf1215a08..541dc96a4c 100644 --- a/src/internal/task/task_stack_tinygoriscv.go +++ b/src/internal/task/task_stack_tinygoriscv.go @@ -1,10 +1,16 @@ -//go:build scheduler.tasks && tinygo.riscv +//go:build (scheduler.tasks || scheduler.cores) && tinygo.riscv package task import "unsafe" -var systemStack uintptr +// Returns a pointer where the system stack can be stored. +// This is a layering violation! We should probably refactor this so that we +// don't need such gymnastics to store the system stack pointer. (It should +// probably be moved to the runtime). +// +//go:linkname runtime_systemStackPtr runtime.systemStackPtr +func runtime_systemStackPtr() *uintptr // calleeSavedRegs is the list of registers that must be saved and restored when // switching between tasks. 
Also see scheduler_riscv.S that relies on the @@ -50,17 +56,18 @@ func (s *state) archInit(r *calleeSavedRegs, fn uintptr, args unsafe.Pointer) { } func (s *state) resume() { - swapTask(s.sp, &systemStack) + swapTask(s.sp, runtime_systemStackPtr()) } func (s *state) pause() { - newStack := systemStack - systemStack = 0 + systemStackPtr := runtime_systemStackPtr() + newStack := *systemStackPtr + *systemStackPtr = 0 swapTask(newStack, &s.sp) } // SystemStack returns the system stack pointer when called from a task stack. // When called from the system stack, it returns 0. func SystemStack() uintptr { - return systemStack + return *runtime_systemStackPtr() } diff --git a/src/internal/task/task_stack_unicore.go b/src/internal/task/task_stack_unicore.go new file mode 100644 index 0000000000..b4425de38f --- /dev/null +++ b/src/internal/task/task_stack_unicore.go @@ -0,0 +1,37 @@ +//go:build scheduler.tasks + +package task + +import "runtime/interrupt" + +// currentTask is the current running task, or nil if currently in the scheduler. +var currentTask *Task + +// Current returns the current active task. +func Current() *Task { + return currentTask +} + +// Pause suspends the current task and returns to the scheduler. +// This function may only be called when running on a goroutine stack, not when running on the system stack or in an interrupt. +func Pause() { + // Check whether the canary (the lowest address of the stack) is still + // valid. If it is not, a stack overflow has occurred. + if *currentTask.state.canaryPtr != stackCanary { + runtimePanic("goroutine stack overflow") + } + if interrupt.In() { + runtimePanic("blocked inside interrupt") + } + currentTask.state.pause() +} + +// Resume the task until it pauses or completes. +// This may only be called from the scheduler. +func (t *Task) Resume() { + currentTask = t + t.gcData.swap() + t.state.resume() + t.gcData.swap() + currentTask = nil +} diff --git a/src/runtime/atomics_critical.go b/src/runtime/atomics_critical.go index 2d98881a10..74ce321f10 100644 --- a/src/runtime/atomics_critical.go +++ b/src/runtime/atomics_critical.go @@ -6,7 +6,6 @@ package runtime import ( - "runtime/interrupt" _ "unsafe" ) @@ -23,27 +22,27 @@ import ( func __atomic_load_2(ptr *uint16, ordering uintptr) uint16 { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. 
- mask := interrupt.Disable() + mask := lockAtomics() val := *ptr - interrupt.Restore(mask) + unlockAtomics(mask) return val } //export __atomic_store_2 func __atomic_store_2(ptr *uint16, val uint16, ordering uintptr) { - mask := interrupt.Disable() + mask := lockAtomics() *ptr = val - interrupt.Restore(mask) + unlockAtomics(mask) } //go:inline func doAtomicCAS16(ptr *uint16, expected, desired uint16) uint16 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -61,10 +60,10 @@ func __atomic_compare_exchange_2(ptr, expected *uint16, desired uint16, successO //go:inline func doAtomicSwap16(ptr *uint16, new uint16) uint16 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -80,11 +79,11 @@ func __atomic_exchange_2(ptr *uint16, new uint16, ordering uintptr) uint16 { //go:inline func doAtomicAdd16(ptr *uint16, value uint16) (old, new uint16) { - mask := interrupt.Disable() + mask := lockAtomics() old = *ptr new = old + value *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old, new } @@ -112,27 +111,27 @@ func __atomic_add_fetch_2(ptr *uint16, value uint16, ordering uintptr) uint16 { func __atomic_load_4(ptr *uint32, ordering uintptr) uint32 { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. - mask := interrupt.Disable() + mask := lockAtomics() val := *ptr - interrupt.Restore(mask) + unlockAtomics(mask) return val } //export __atomic_store_4 func __atomic_store_4(ptr *uint32, val uint32, ordering uintptr) { - mask := interrupt.Disable() + mask := lockAtomics() *ptr = val - interrupt.Restore(mask) + unlockAtomics(mask) } //go:inline func doAtomicCAS32(ptr *uint32, expected, desired uint32) uint32 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -150,10 +149,10 @@ func __atomic_compare_exchange_4(ptr, expected *uint32, desired uint32, successO //go:inline func doAtomicSwap32(ptr *uint32, new uint32) uint32 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -169,11 +168,11 @@ func __atomic_exchange_4(ptr *uint32, new uint32, ordering uintptr) uint32 { //go:inline func doAtomicAdd32(ptr *uint32, value uint32) (old, new uint32) { - mask := interrupt.Disable() + mask := lockAtomics() old = *ptr new = old + value *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old, new } @@ -201,27 +200,27 @@ func __atomic_add_fetch_4(ptr *uint32, value uint32, ordering uintptr) uint32 { func __atomic_load_8(ptr *uint64, ordering uintptr) uint64 { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. 
- mask := interrupt.Disable() + mask := lockAtomics() val := *ptr - interrupt.Restore(mask) + unlockAtomics(mask) return val } //export __atomic_store_8 func __atomic_store_8(ptr *uint64, val uint64, ordering uintptr) { - mask := interrupt.Disable() + mask := lockAtomics() *ptr = val - interrupt.Restore(mask) + unlockAtomics(mask) } //go:inline func doAtomicCAS64(ptr *uint64, expected, desired uint64) uint64 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -239,10 +238,10 @@ func __atomic_compare_exchange_8(ptr, expected *uint64, desired uint64, successO //go:inline func doAtomicSwap64(ptr *uint64, new uint64) uint64 { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -258,11 +257,11 @@ func __atomic_exchange_8(ptr *uint64, new uint64, ordering uintptr) uint64 { //go:inline func doAtomicAdd64(ptr *uint64, value uint64) (old, new uint64) { - mask := interrupt.Disable() + mask := lockAtomics() old = *ptr new = old + value *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old, new } diff --git a/src/runtime/gc_stack_cores.go b/src/runtime/gc_stack_cores.go new file mode 100644 index 0000000000..32c7e68232 --- /dev/null +++ b/src/runtime/gc_stack_cores.go @@ -0,0 +1,101 @@ +//go:build scheduler.cores + +package runtime + +import ( + "internal/task" + "sync/atomic" +) + +// Normally 0. During a GC scan it has various purposes for signalling between +// the core running the GC and the other cores in the system. +var gcScanState atomic.Uint32 + +// Start GC scan by pausing the world (all other cores) and scanning their +// stacks. It doesn't resume the world. +func gcMarkReachable() { + core := currentCPU() + + // Interrupt all other cores. + gcScanState.Store(1) + for i := uint32(0); i < numCPU; i++ { + if i == core { + continue + } + gcPauseCore(i) + } + + // Scan the stack(s) of the current core. + scanCurrentStack() + if !task.OnSystemStack() { + // Mark system stack. + markRoots(task.SystemStack(), coreStackTop(core)) + } + + // Scan globals. + findGlobals(markRoots) + + // Busy-wait until all the other cores are ready. They certainly should be, + // after the scanning we did above. + for gcScanState.Load() != numCPU { + spinLoopHint() + } + gcScanState.Store(0) + + // Signal each core in turn that they can scan the stack. + for i := uint32(0); i < numCPU; i++ { + if i == core { + continue + } + + // Wake up the core to scan the stack. + gcSignalCore(i) + + // Busy-wait until this core finished scanning. + for gcScanState.Load() == 0 { + spinLoopHint() + } + gcScanState.Store(0) + } + + // All the stack are now scanned. +} + +//go:export tinygo_scanCurrentStack +func scanCurrentStack() + +//go:export tinygo_scanstack +func scanstack(sp uintptr) { + // Mark the current stack. + // This function is called by scanCurrentStack, after pushing all registers + // onto the stack. + if task.OnSystemStack() { + // This is the system stack. + // Scan all words on the stack. + markRoots(sp, coreStackTop(currentCPU())) + } else { + // This is a goroutine stack. + markCurrentGoroutineStack(sp) + } +} + +// Resume the world after a call to gcMarkReachable. +func gcResumeWorld() { + // Signal each core that they can resume. + hartID := currentCPU() + for i := uint32(0); i < numCPU; i++ { + if i == hartID { + continue + } + + // Signal the core. 
+ gcSignalCore(i) + } + + // Busy-wait until the core acknowledges the signal (and is going to return + // from the interrupt handler). + for gcScanState.Load() != numCPU-1 { + spinLoopHint() + } + gcScanState.Store(0) +} diff --git a/src/runtime/gc_stack_raw.go b/src/runtime/gc_stack_raw.go index 5c302b1f11..03c37696a9 100644 --- a/src/runtime/gc_stack_raw.go +++ b/src/runtime/gc_stack_raw.go @@ -1,8 +1,14 @@ -//go:build (gc.conservative || gc.precise || gc.boehm) && !tinygo.wasm && !scheduler.threads +//go:build (gc.conservative || gc.precise || gc.boehm) && !tinygo.wasm && !scheduler.threads && !scheduler.cores package runtime -import "internal/task" +import ( + "internal/task" + "sync/atomic" +) + +// Unused. +var gcScanState atomic.Uint32 func gcMarkReachable() { markStack() diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 9ae1f982b9..e18a306349 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -99,7 +99,8 @@ func runtimePanicAt(addr unsafe.Pointer, msg string) { } else { printstring("panic: runtime error: ") } - println(msg) + printstring(msg) + printnl() abort() } diff --git a/src/runtime/print.go b/src/runtime/print.go index a4de460253..a5fba0c8d0 100644 --- a/src/runtime/print.go +++ b/src/runtime/print.go @@ -1,7 +1,6 @@ package runtime import ( - "internal/task" "unsafe" ) @@ -9,18 +8,6 @@ type stringer interface { String() string } -// Lock to make sure print calls do not interleave. -// This is a no-op lock on systems that do not have parallelism. -var printLock task.PMutex - -func printlock() { - printLock.Lock() -} - -func printunlock() { - printLock.Unlock() -} - //go:nobounds func printstring(s string) { for i := 0; i < len(s); i++ { diff --git a/src/runtime/runtime_tinygoriscv_qemu.go b/src/runtime/runtime_tinygoriscv_qemu.go index a77ad71f55..09d67b6243 100644 --- a/src/runtime/runtime_tinygoriscv_qemu.go +++ b/src/runtime/runtime_tinygoriscv_qemu.go @@ -4,26 +4,82 @@ package runtime import ( "device/riscv" + "internal/task" + "math/bits" + "runtime/interrupt" "runtime/volatile" + "sync/atomic" "unsafe" ) // This file implements the VirtIO RISC-V interface implemented in QEMU, which // is an interface designed for emulation. +const numCPU = 4 + //export main func main() { - preinit() - // Set the interrupt address. // Note that this address must be aligned specially, otherwise the MODE bits // of MTVEC won't be zero. riscv.MTVEC.Set(uintptr(unsafe.Pointer(&handleInterruptASM))) + // Enable software interrupts. We'll need them to wake up other cores. + riscv.MIE.SetBits(riscv.MIE_MSIE) + + // If we're not hart 0, wait until we get the signal everything has been set + // up. + if hartID := riscv.MHARTID.Get(); hartID != 0 { + // Wait until we get the signal this hart is ready to start. + // Note that interrupts are disabled, which means that the interrupt + // isn't actually taken. But we can still wait for it using wfi. + // If the cores scheduler is not used, we'll stay in this state forever. + for riscv.MIP.Get()&riscv.MIP_MSIP == 0 { + riscv.Asm("wfi") + } + + // Clear the software interrupt. + aclintMSWI.MSIP[hartID].Set(0) + + // Now that we've cleared the software interrupt, we can enable + // interrupts as was already done on hart 0. + riscv.MSTATUS.SetBits(riscv.MSTATUS_MIE) + + // Also enable timer interrupts, for sleepTicksMulticore. + riscv.MIE.SetBits(riscv.MIE_MTIE) + + // Now start running the scheduler on this core. 
+ schedulerLock.Lock() + scheduler(false) + + // The scheduler exited, which means main returned and the program + // should exit immediately. + // Signal hart 0 to exit. + exitCodePlusOne.Store(0 + 1) // exit code 0 + aclintMSWI.MSIP[0].Set(1) + + // Unlock the scheduler to be sure. Shouldn't be needed. + schedulerLock.Unlock() + + // Wait until hart 0 actually exits. + for { + riscv.Asm("wfi") + } + } + // Enable global interrupts now that they've been set up. // This is currently only for timer interrupts. riscv.MSTATUS.SetBits(riscv.MSTATUS_MIE) + // Set all MTIMECMP registers to a value that clears the MTIP bit in MIP. + // If we don't do this, the wfi instruction won't work as expected. + for i := 0; i < numCPU; i++ { + aclintMTIMECMP[i].Set(0xffff_ffff_ffff_ffff) + } + + // Enable timer interrupts on hart 0. + riscv.MIE.SetBits(riscv.MIE_MTIE) + run() exit(0) } @@ -37,13 +93,32 @@ func handleInterrupt() { code := uint(cause &^ (1 << 31)) if cause&(1<<31) != 0 { // Topmost bit is set, which means that it is an interrupt. + hartID := currentCPU() switch code { + case riscv.MachineSoftwareInterrupt: + if exitCodePlusOne.Load() != 0 { + exitNow(exitCodePlusOne.Load() - 1) + } + if gcScanState.Load() != 0 { + // The GC needs to run. + gcInterruptHandler(hartID) + } + checkpoint := &schedulerWaitCheckpoints[hartID] + if checkpoint.Saved() { + aclintMSWI.MSIP[hartID].Set(0) + riscv.MCAUSE.Set(0) + checkpoint.Jump() + } case riscv.MachineTimerInterrupt: - // Signal timeout. - timerWakeup.Set(1) - // Disable the timer, to avoid triggering the interrupt right after - // this interrupt returns. - riscv.MIE.ClearBits(riscv.MIE_MTIE) + if sleepCheckpoint.Saved() { + // Set MTIMECMP to a high value so that MTIP goes low. + aclintMTIMECMP[hartID].Set(0xffff_ffff_ffff_ffff) + riscv.MCAUSE.Set(0) + sleepCheckpoint.Jump() + } + default: + runtimePanic("unknown interrupt") + abort() } } else { // Topmost bit is clear, so it is an exception of some sort. @@ -57,6 +132,79 @@ func handleInterrupt() { riscv.MCAUSE.Set(0) } +// The GC interrupted this core for the stop-the-world phase. +// This function handles that, and only returns after the stop-the-world phase +// ended. +func gcInterruptHandler(hartID uint32) { + // *only* enable the MSIE interrupt + savedMIE := riscv.MIE.Get() + riscv.MIE.Set(riscv.MIE_MSIE) + + // Disable this interrupt (to be enabled again soon). + aclintMSWI.MSIP[hartID].Set(0) + + // Let the GC know we're ready. + gcScanState.Add(1) + + // Wait until we get a signal to start scanning. + for riscv.MIP.Get()&riscv.MIP_MSIP == 0 { + riscv.Asm("wfi") + } + aclintMSWI.MSIP[hartID].Set(0) + + // Scan the stack(s) of this core. + scanCurrentStack() + if !task.OnSystemStack() { + // Mark system stack. + markRoots(task.SystemStack(), coreStackTop(hartID)) + } + + // Signal we've finished scanning. + gcScanState.Store(1) + + // Wait until we get a signal that the stop-the-world phase has ended. + for riscv.MIP.Get()&riscv.MIP_MSIP == 0 { + riscv.Asm("wfi") + } + aclintMSWI.MSIP[hartID].Set(0) + + // Restore MIE bits. + riscv.MIE.Set(savedMIE) + + // Signal we received the signal and are going to exit the interrupt. + gcScanState.Add(1) +} + +//go:extern _stack_top +var stack0TopSymbol [0]byte + +//go:extern _stack1_top +var stack1TopSymbol [0]byte + +//go:extern _stack2_top +var stack2TopSymbol [0]byte + +//go:extern _stack3_top +var stack3TopSymbol [0]byte + +// Returns the stack top (highest address) of the system stack of the given +// core. 
+func coreStackTop(core uint32) uintptr { + switch core { + case 0: + return uintptr(unsafe.Pointer(&stack0TopSymbol)) + case 1: + return uintptr(unsafe.Pointer(&stack1TopSymbol)) + case 2: + return uintptr(unsafe.Pointer(&stack2TopSymbol)) + case 3: + return uintptr(unsafe.Pointer(&stack3TopSymbol)) + default: + runtimePanic("unexpected core") + return 0 + } +} + // One tick is 100ns by default in QEMU. // (This is not a standard, just the default used by QEMU). func ticksToNanoseconds(ticks timeUnit) int64 { @@ -67,22 +215,79 @@ func nanosecondsToTicks(ns int64) timeUnit { return timeUnit(ns / 100) // one tick is 100ns } -var timerWakeup volatile.Register8 +var sleepCheckpoint interrupt.Checkpoint func sleepTicks(d timeUnit) { - // Enable the timer. - target := uint64(ticks() + d) - aclintMTIMECMP.Set(target) - riscv.MIE.SetBits(riscv.MIE_MTIE) + hartID := currentCPU() + if sleepCheckpoint.Save() { + // Configure timeout. + target := uint64(ticks() + d) + aclintMTIMECMP[hartID].Set(target) - // Wait until it fires. - for { - if timerWakeup.Get() != 0 { - timerWakeup.Set(0) - // Disable timer. - break + // Wait for the interrupt to happen. + for { + riscv.Asm("wfi") + } + } + + // We got awoken. +} + +// Currently sleeping core, or 0xff. +// Must only be accessed with the scheduler lock held. +var sleepingCore uint8 = 0xff + +// Return whether another core is sleeping. +// May only be called with the scheduler lock held. +func hasSleepingCore() bool { + return sleepingCore != 0xff +} + +// Almost identical to sleepTicks, except that it will unlock/lock the scheduler +// while sleeping and is interruptible by interruptSleepTicksMulticore. +// This may only be called with the scheduler lock held. +func sleepTicksMulticore(d timeUnit) { + // Disable interrupts while configuring sleep. + // This is needed because unlocking the scheduler and setting the timer + // interrupt need to happen atomically. + riscv.MSTATUS.ClearBits(riscv.MSTATUS_MIE) + + hartID := currentCPU() + if sleepCheckpoint.Save() { + sleepingCore = uint8(hartID) + + // Configure timeout. + target := uint64(ticks() + d) + aclintMTIMECMP[hartID].Set(target) + + // Unlock, now that the timeout has been set (so that + // interruptSleepTicksMulticore will see the correct wakeup time). + schedulerLock.Unlock() + + // Sleep has been configured, interrupts may happen again. + riscv.MSTATUS.SetBits(riscv.MSTATUS_MIE) + + // Wait for the interrupt to happen. + for { + riscv.Asm("wfi") + } + } + // We got awoken. + + // Lock again, after we finished sleeping. + schedulerLock.Lock() + sleepingCore = 0xff +} + +// Interrupt an ongoing call to sleepTicksMulticore on another core. +// This may only be called with the scheduler lock held. +func interruptSleepTicksMulticore(wakeup timeUnit) { + if sleepingCore != 0xff { + // Immediately exit the sleep. + old := aclintMTIMECMP[sleepingCore].Get() + if uint64(wakeup) < old { + aclintMTIMECMP[sleepingCore].Set(uint64(wakeup)) } - riscv.Asm("wfi") } } @@ -98,7 +303,7 @@ func ticks() timeUnit { return timeUnit(lowBits) | (timeUnit(highBits) << 32) } // Retry, because there was a rollover in the low bits (happening every - // 429 days). + // ~7 days). 
highBits = newHighBits } } @@ -120,7 +325,10 @@ var ( low volatile.Register32 high volatile.Register32 })(unsafe.Pointer(uintptr(0x0200_bff8))) - aclintMTIMECMP = (*volatile.Register64)(unsafe.Pointer(uintptr(0x0200_4000))) + aclintMTIMECMP = (*[4095]volatile.Register64)(unsafe.Pointer(uintptr(0x0200_4000))) + aclintMSWI = (*struct { + MSIP [4095]volatile.Register32 + })(unsafe.Pointer(uintptr(0x0200_0000))) ) func putchar(c byte) { @@ -137,17 +345,166 @@ func buffered() int { return 0 } +// Define the various spinlocks needed by the runtime. +var ( + schedulerLock spinLock + futexLock spinLock + atomicsLock spinLock + printLock spinLock +) + +type spinLock struct { + atomic.Uint32 +} + +func (l *spinLock) Lock() { + // Try to replace 0 with 1. Once we succeed, the lock has been acquired. + for !l.Uint32.CompareAndSwap(0, 1) { + spinLoopHint() + } +} + +func (l *spinLock) Unlock() { + // Safety check: the spinlock should have been locked. + if schedulerAsserts && l.Uint32.Load() != 1 { + runtimePanic("unlock of unlocked spinlock") + } + + // Unlock the lock. Simply write 0, because we already know it is locked. + l.Uint32.Store(0) +} + +// Hint to the CPU that this core is just waiting, and the core can go into a +// lower energy state. +func spinLoopHint() { + // This is a no-op in QEMU TCG (but added here for completeness): + // https://github.com/qemu/qemu/blob/v9.2.3/target/riscv/insn_trans/trans_rvi.c.inc#L856 + riscv.Asm("pause") +} + +func currentCPU() uint32 { + return uint32(riscv.MHARTID.Get()) +} + +func startSecondaryCores() { + // Start all the other cores besides hart 0. + for hart := 1; hart < numCPU; hart++ { + // Signal the given hart it is ready to start using a software + // interrupt. + aclintMSWI.MSIP[hart].Set(1) + } +} + +// Bitset of harts that are currently sleeping in schedulerUnlockAndWait. +// This supports up to 8 harts. +// This variable may only be accessed with the scheduler lock held. +var sleepingHarts uint8 + +// Checkpoints for cores waiting for runnable tasks. +var schedulerWaitCheckpoints [numCPU]interrupt.Checkpoint + +// Put the scheduler to sleep, since there are no tasks to run. +// This will unlock the scheduler lock, and must be called with the scheduler +// lock held. +func schedulerUnlockAndWait() { + hartID := currentCPU() + + // Mark the current hart as sleeping. + sleepingHarts |= uint8(1 << hartID) + + // If this is the last core awake and is going to sleep, the scheduler is + // deadlocked. + // We can do this check since this is not baremetal: there won't be any + // external interrupts that might unblock a goroutine. + if sleepingHarts == (1< timeUnit(t.Data) { + // Found a task in the queue that has a timeout before the + // to-be-sleeping task. Insert our task right before. + break + } + q = &(*q).Next + } + + // Insert the task into the queue (this could be at the end, if *q is nil). + t.Next = *q + *q = t +} + +func Gosched() { + schedulerLock.Lock() + runqueue.Push(task.Current()) + task.PauseLocked() +} + +func addTimer(tn *timerNode) { + schedulerLock.Lock() + timerQueueAdd(tn) + interruptSleepTicksMulticore(tn.whenTicks()) + schedulerLock.Unlock() +} + +func removeTimer(t *timer) *timerNode { + schedulerLock.Lock() + n := timerQueueRemove(t) + schedulerLock.Unlock() + return n +} + +func schedulerRunQueue() *task.Queue { + return &runqueue +} + +// Pause the current task for a given time. 
+// +//go:linkname sleep time.Sleep +func sleep(duration int64) { + if duration <= 0 { + return + } + + wakeup := ticks() + nanosecondsToTicks(duration) + + // While the scheduler is locked: + // - add this task to the sleep queue + // - switch to the scheduler (only allowed while locked) + // - let the scheduler handle it from there + schedulerLock.Lock() + addSleepTask(task.Current(), wakeup) + task.PauseLocked() +} + +// This function is called on the first core in the system. It will wake up the +// other cores when ready. +func run() { + initHeap() + + go func() { + // Package initializers are currently run single-threaded. + // This might help with registering interrupts and such. + initAll() + + // After package initializers have finished, start all the other cores. + startSecondaryCores() + + // Run main.main. + callMain() + + // main.main has exited, so the program should exit. + mainExited.Store(1) + }() + + // The scheduler must always be entered while the scheduler lock is taken. + schedulerLock.Lock() + scheduler(false) + schedulerLock.Unlock() +} + +func scheduler(_ bool) { + for mainExited.Load() == 0 { + // Check for ready-to-run tasks. + if runnable := runqueue.Pop(); runnable != nil { + // Resume it now. + setCurrentTask(runnable) + runnable.RunState = task.RunStateRunning + schedulerLock.Unlock() // unlock before resuming, Pause() will lock again + runnable.Resume() + setCurrentTask(nil) + + continue + } + + var now timeUnit + if sleepQueue != nil || timerQueue != nil { + now = ticks() + + // Check whether the first task in the sleep queue is ready to run. + if sleepingTask := sleepQueue; sleepingTask != nil && now >= timeUnit(sleepingTask.Data) { + // It is, pop it from the queue. + sleepQueue = sleepQueue.Next + sleepingTask.Next = nil + + // Run it now. + setCurrentTask(sleepingTask) + sleepingTask.RunState = task.RunStateRunning + schedulerLock.Unlock() // unlock before resuming, Pause() will lock again + sleepingTask.Resume() + setCurrentTask(nil) + continue + } + + // Check whether a timer has expired that needs to be run. + if timerQueue != nil && now >= timerQueue.whenTicks() { + delay := ticksToNanoseconds(now - timerQueue.whenTicks()) + // Pop timer from queue. + tn := timerQueue + timerQueue = tn.next + tn.next = nil + + // Run the callback stored in this timer node. + schedulerLock.Unlock() + tn.callback(tn, delay) + schedulerLock.Lock() + continue + } + } + + // At this point, there are no runnable tasks anymore. + // If another core is using the clock, let it handle the sleep queue. + if hasSleepingCore() { + schedulerUnlockAndWait() + continue + } + + // The timer is free to use, so check whether there are any future + // tasks/timers that we can wait for. + var timeLeft timeUnit + if sleepingTask := sleepQueue; sleepingTask != nil { + // We already checked that there is no ready-to-run sleeping task + // (using the same 'now' value), so timeLeft will always be + // positive. + timeLeft = timeUnit(sleepingTask.Data) - now + } + if timerQueue != nil { + // If the timer queue needs to run earlier, reduce the time we are + // going to sleep. + // Like with sleepQueue, we already know there is no timer ready to + // run since we already checked above. + timeLeftForTimer := timerQueue.whenTicks() - now + if sleepQueue == nil || timeLeftForTimer < timeLeft { + timeLeft = timeLeftForTimer + } + } + + if timeLeft > 0 { + // Sleep for a bit until the next task or timer is ready to run. 
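+			// (sleepTicksMulticore releases the scheduler lock while it waits
+			// and re-acquires it before returning, so other cores can keep
+			// scheduling in the meantime.)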
+ sleepTicksMulticore(timeLeft) + continue + } + + // No runnable tasks and no sleeping tasks or timers. There's nothing to + // do. + // Wait until something happens (like an interrupt). + schedulerUnlockAndWait() + } +} + +func currentTask() *task.Task { + return cpuTasks[currentCPU()] +} + +func setCurrentTask(task *task.Task) { + cpuTasks[currentCPU()] = task +} + +func lockScheduler() { + schedulerLock.Lock() +} + +func unlockScheduler() { + schedulerLock.Unlock() +} + +func lockFutex() interrupt.State { + mask := interrupt.Disable() + futexLock.Lock() + return mask +} + +func unlockFutex(state interrupt.State) { + futexLock.Unlock() + interrupt.Restore(state) +} + +// Use a single spinlock for atomics. This works fine, since atomics are very +// short sequences of instructions. +func lockAtomics() interrupt.State { + mask := interrupt.Disable() + atomicsLock.Lock() + return mask +} + +func unlockAtomics(mask interrupt.State) { + atomicsLock.Unlock() + interrupt.Restore(mask) +} + +var systemStack [numCPU]uintptr + +// Implementation detail of the internal/task package. +// It needs to store the system stack pointer somewhere, and needs to know how +// many cores there are to do so. But it doesn't know the number of cores. Hence +// why this is implemented in the runtime. +func systemStackPtr() *uintptr { + return &systemStack[currentCPU()] +} + +// Color the 'print' and 'println' output according to the current CPU. +// This may be helpful for debugging, but should be disabled otherwise. +const cpuColoredPrint = false + +func printlock() { + printLock.Lock() + if cpuColoredPrint { + switch currentCPU() { + case 1: + printstring("\x1b[32m") // green + case 2: + printstring("\x1b[33m") // yellow + case 3: + printstring("\x1b[34m") // blue + } + } +} + +func printunlock() { + if cpuColoredPrint { + if currentCPU() != 0 { + printstring("\x1b[0m") // reset colored output + } + } + printLock.Unlock() +} diff --git a/src/runtime/scheduler_none.go b/src/runtime/scheduler_none.go index 1539739714..06722afcf8 100644 --- a/src/runtime/scheduler_none.go +++ b/src/runtime/scheduler_none.go @@ -2,7 +2,10 @@ package runtime -import "internal/task" +import ( + "internal/task" + "runtime/interrupt" +) const hasScheduler = false @@ -73,3 +76,19 @@ func scheduler(returnAtDeadlock bool) { // this code should be unreachable. runtimePanic("unreachable: scheduler must not be called with the 'none' scheduler") } + +func lockAtomics() interrupt.State { + return interrupt.Disable() +} + +func unlockAtomics(mask interrupt.State) { + interrupt.Restore(mask) +} + +func printlock() { + // nothing to do +} + +func printunlock() { + // nothing to do +} diff --git a/src/runtime/scheduler_tasks.go b/src/runtime/scheduler_tasks.go new file mode 100644 index 0000000000..b65339375f --- /dev/null +++ b/src/runtime/scheduler_tasks.go @@ -0,0 +1,13 @@ +//go:build scheduler.tasks + +package runtime + +var systemStack uintptr + +// Implementation detail of the internal/task package. +// It needs to store the system stack pointer somewhere, and needs to know how +// many cores there are to do so. But it doesn't know the number of cores. Hence +// why this is implemented in the runtime. 
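+// The "tasks" scheduler only runs on a single core, so a single system stack
+// pointer is enough here.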
+func systemStackPtr() *uintptr { + return &systemStack +} diff --git a/src/runtime/scheduler_threads.go b/src/runtime/scheduler_threads.go index 32b9caaf21..6d41d0c99e 100644 --- a/src/runtime/scheduler_threads.go +++ b/src/runtime/scheduler_threads.go @@ -2,7 +2,10 @@ package runtime -import "internal/task" +import ( + "internal/task" + "runtime/interrupt" +) const hasScheduler = false // not using the cooperative scheduler @@ -127,3 +130,30 @@ func runqueueForGC() *task.Queue { // There is only a runqueue when using the cooperative scheduler. return nil } + +// Lock to make sure print calls do not interleave. +var printLock task.Mutex + +func printlock() { + printLock.Lock() +} + +func printunlock() { + printLock.Unlock() +} + +// The atomics lock isn't used as a lock for actual atomics. It is used inside +// internal/task.Stack and internal/task.Queue to make sure their operations are +// actually atomic. (This might not actually be needed, since the use in +// sync.Cond doesn't need atomicity). + +var atomicsLock task.Mutex + +func lockAtomics() interrupt.State { + atomicsLock.Lock() + return 0 +} + +func unlockAtomics(mask interrupt.State) { + atomicsLock.Unlock() +} diff --git a/targets/riscv-qemu.json b/targets/riscv-qemu.json index 8a85cf9a72..318089332b 100644 --- a/targets/riscv-qemu.json +++ b/targets/riscv-qemu.json @@ -1,8 +1,21 @@ { - "inherits": ["riscv32"], - "features": "+32bit,+a,+c,+m,+zmmul,-b,-d,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zacas,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-f,-h,-relax,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xesppie,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-za64rs,-zaamo,-zabha,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zicond,-zicsr,-zifencei,-zihintntl,-zihintpause,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b", - "build-tags": ["virt", "qemu"], + "inherits": [ + "riscv32" + ], + "features": 
"+32bit,+a,+c,+m,+zihintpause,+zmmul,-b,-d,-e,-experimental-smmpm,-experimental-smnpm,-experimental-ssnpm,-experimental-sspm,-experimental-ssqosid,-experimental-supm,-experimental-zacas,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-f,-h,-relax,-shcounterenw,-shgatpa,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcsrind,-smepmp,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-v,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xesppie,-xsfcease,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-za64rs,-zaamo,-zabha,-zalrsc,-zama16b,-zawrs,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zca,-zcb,-zcd,-zce,-zcf,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccif,-zicclsm,-ziccrse,-zicntr,-zicond,-zicsr,-zifencei,-zihintntl,-zihpm,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zve32f,-zve32x,-zve64d,-zve64f,-zve64x,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl128b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl32b,-zvl4096b,-zvl512b,-zvl64b,-zvl65536b,-zvl8192b", + "build-tags": [ + "virt", + "qemu" + ], + "scheduler": "cores", "default-stack-size": 8192, + "cflags": [ + "-march=rv32imaczihintpause", + "-DTINYGO_CORES=4" + ], + "ldflags": [ + "--defsym=__num_stacks=4" + ], "linkerscript": "targets/riscv-qemu.ld", - "emulator": "qemu-system-riscv32 -machine virt,aclint=on -nographic -bios none -device virtio-rng-device -kernel {}" + "emulator": "qemu-system-riscv32 -machine virt,aclint=on -smp 4 -nographic -bios none -device virtio-rng-device -kernel {}" } diff --git a/targets/riscv.ld b/targets/riscv.ld index eecac6b476..a668d1d6d6 100644 --- a/targets/riscv.ld +++ b/targets/riscv.ld @@ -16,13 +16,34 @@ SECTIONS /* Put the stack at the bottom of RAM, so that the application will * crash on stack overflow instead of silently corrupting memory. * See: http://blog.japaric.io/stack-overflow-protection/ */ - .stack (NOLOAD) : + .stack0 (NOLOAD) : { . = ALIGN(16); . += _stack_size; _stack_top = .; } >RAM + .stack1 (NOLOAD) : + { + . = ALIGN(16); + . += DEFINED(__num_stacks) && __num_stacks >= 2 ? _stack_size : 0; + _stack1_top = .; + } >RAM + + .stack2 (NOLOAD) : + { + . = ALIGN(16); + . += DEFINED(__num_stacks) && __num_stacks >= 3 ? _stack_size : 0; + _stack2_top = .; + } >RAM + + .stack3 (NOLOAD) : + { + . = ALIGN(16); + . += DEFINED(__num_stacks) && __num_stacks >= 4 ? _stack_size : 0; + _stack3_top = .; + } >RAM + /* Start address (in flash) of .data, used by startup code. 
*/ _sidata = LOADADDR(.data); diff --git a/tools/gen-critical-atomics/gen-critical-atomics.go b/tools/gen-critical-atomics/gen-critical-atomics.go index 75ea327076..98ceebb020 100644 --- a/tools/gen-critical-atomics/gen-critical-atomics.go +++ b/tools/gen-critical-atomics/gen-critical-atomics.go @@ -26,7 +26,6 @@ package runtime import ( _ "unsafe" - "runtime/interrupt" ) // Documentation: @@ -41,29 +40,29 @@ import ( func __atomic_load_{{.}}(ptr *uint{{$bits}}, ordering uintptr) uint{{$bits}} { // The LLVM docs for this say that there is a val argument after the pointer. // That is a typo, and the GCC docs omit it. - mask := interrupt.Disable() + mask := lockAtomics() val := *ptr - interrupt.Restore(mask) + unlockAtomics(mask) return val } {{end}} {{- define "store"}}{{$bits := mul . 8 -}} //export __atomic_store_{{.}} func __atomic_store_{{.}}(ptr *uint{{$bits}}, val uint{{$bits}}, ordering uintptr) { - mask := interrupt.Disable() + mask := lockAtomics() *ptr = val - interrupt.Restore(mask) + unlockAtomics(mask) } {{end}} {{- define "cas"}}{{$bits := mul . 8 -}} //go:inline func doAtomicCAS{{$bits}}(ptr *uint{{$bits}}, expected, desired uint{{$bits}}) uint{{$bits}} { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr if old == expected { *ptr = desired } - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -82,10 +81,10 @@ func __atomic_compare_exchange_{{.}}(ptr, expected *uint{{$bits}}, desired uint{ {{- define "swap"}}{{$bits := mul . 8 -}} //go:inline func doAtomicSwap{{$bits}}(ptr *uint{{$bits}}, new uint{{$bits}}) uint{{$bits}} { - mask := interrupt.Disable() + mask := lockAtomics() old := *ptr *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old } @@ -111,11 +110,11 @@ func __atomic_exchange_{{.}}(ptr *uint{{$bits}}, new uint{{$bits}}, ordering uin //go:inline func {{$opfn}}(ptr *{{$type}}, value {{$type}}) (old, new {{$type}}) { - mask := interrupt.Disable() + mask := lockAtomics() old = *ptr {{$opdef}} *ptr = new - interrupt.Restore(mask) + unlockAtomics(mask) return old, new }
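
Each of these templates expands to the same critical-section shape: acquire the
atomics lock, perform a plain read-modify-write, release the lock. A rough
standalone sketch of that shape follows; it is not code from this patch. The
spinLock type mirrors the spinlock added for the "cores" scheduler, fetchAdd32
is only an illustrative stand-in for the generated __atomic_fetch_add_4, and
the real helpers also disable interrupts around the lock:

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // spinLock mirrors the spinlock used by the "cores" scheduler: a single
    // word that is 0 when unlocked and 1 when held.
    type spinLock struct {
        atomic.Uint32
    }

    func (l *spinLock) Lock() {
        // Spin until the word changes from 0 (unlocked) to 1 (locked).
        for !l.CompareAndSwap(0, 1) {
        }
    }

    func (l *spinLock) Unlock() {
        // The lock is held by us, so a plain store of 0 releases it.
        l.Store(0)
    }

    var atomicsLock spinLock

    // fetchAdd32 has the same shape as the generated __atomic_fetch_add_4: a
    // plain read-modify-write protected by the atomics lock.
    func fetchAdd32(ptr *uint32, value uint32) (old, new uint32) {
        atomicsLock.Lock()
        old = *ptr
        new = old + value
        *ptr = new
        atomicsLock.Unlock()
        return old, new
    }

    func main() {
        var counter uint32
        fetchAdd32(&counter, 3)
        fetchAdd32(&counter, 4)
        fmt.Println(counter) // prints 7
    }

Because every atomic helper funnels through the same lock, plain loads and
stores inside the critical section are enough; the only place that needs a real
hardware atomic is the compare-and-swap in Lock itself.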