Skip to content

Commit f905812

Browse files
authored
Merge pull request #2036 from cardstack/cs-7805-update-deploy-scripts-to-deploy-code-to-worker-ecs-container
Multiple worker support
2 parents 0b30125 + 348026b commit f905812

23 files changed

+550
-96
lines changed

.github/workflows/ci.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ jobs:
9999
matrix:
100100
shardIndex: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
101101
shardTotal: [12]
102+
concurrency:
103+
group: matrix-client-test-${{ matrix.shardIndex }}-${{ github.head_ref || github.run_id }}
104+
cancel-in-progress: true
102105
steps:
103106
- uses: actions/checkout@v4
104107
- uses: ./.github/actions/init

.github/workflows/manual-deploy.yml

+41-3
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,21 @@ jobs:
7474
with:
7575
repository: "boxel-realm-server-${{ inputs.environment }}"
7676
environment: ${{ inputs.environment }}
77-
dockerfile: "packages/realm-server/Dockerfile"
77+
dockerfile: "packages/realm-server/realm-server.Dockerfile"
7878
build-args: |
7979
"realm_server_script=start:${{ inputs.environment }}"
8080
81+
build-worker:
82+
name: Build worker Docker image
83+
uses: cardstack/gh-actions/.github/workflows/docker-ecr.yml@main
84+
secrets: inherit
85+
with:
86+
repository: "boxel-worker-${{ inputs.environment }}"
87+
environment: ${{ inputs.environment }}
88+
dockerfile: "packages/realm-server/worker.Dockerfile"
89+
build-args: |
90+
"worker_script=start:worker-${{ inputs.environment }}"
91+
8192
build-pg-migration:
8293
name: Build pg-migration Docker image
8394
uses: cardstack/gh-actions/.github/workflows/docker-ecr.yml@main
@@ -103,16 +114,43 @@ jobs:
103114
image: ${{ needs.build-pg-migration.outputs.image }}
104115
wait-for-service-stability: false
105116

117+
# the wait-for-service-stability flag doesn't seem to work in
118+
# aws-actions/amazon-ecs-deploy-task-definition@v2. We keep getting timeouts
119+
# waiting for service stability. So we are manually waiting here.
106120
post-migrate-db:
107121
name: Wait for db-migration
108122
needs: [migrate-db]
109123
runs-on: ubuntu-latest
110124
steps:
111-
- run: sleep 240
125+
- run: sleep 180
126+
127+
deploy-worker:
128+
name: Deploy worker
129+
needs: [build-worker, deploy-host, post-migrate-db]
130+
uses: cardstack/gh-actions/.github/workflows/ecs-deploy.yml@main
131+
secrets: inherit
132+
with:
133+
container-name: "boxel-worker"
134+
environment: ${{ inputs.environment }}
135+
cluster: ${{ inputs.environment }}
136+
service-name: "boxel-worker-${{ inputs.environment }}"
137+
image: ${{ needs.build-worker.outputs.image }}
138+
wait-for-service-stability: false
139+
140+
# the wait-for-service-stability flag doesn't seem to work in
141+
# aws-actions/amazon-ecs-deploy-task-definition@v2. We keep getting timeouts
142+
# waiting for service stability. So we are manually waiting here.
143+
post-deploy-worker:
144+
name: Wait for worker
145+
needs: [deploy-worker]
146+
runs-on: ubuntu-latest
147+
steps:
148+
- run: sleep 180
112149

113150
deploy-realm-server:
114151
name: Deploy realm server
115-
needs: [build-realm-server, deploy-host, post-migrate-db]
152+
needs:
153+
[post-deploy-worker, build-realm-server, deploy-host, post-migrate-db]
116154
uses: cardstack/gh-actions/.github/workflows/ecs-deploy.yml@main
117155
secrets: inherit
118156
with:

README.md

+6-2
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ Live reloads are not available in this mode, however, if you use start the serve
7373

7474
#### Using `start:all`
7575

76-
Instead of running `pnpm start:base`, you can alternatively use `pnpm start:all` which also serves a few other realms on other ports--this is convenient if you wish to switch between the app and the tests without having to restart servers. Here's what is spun up with `start:all`:
76+
Instead of running `pnpm start:base`, you can alternatively use `pnpm start:all` which also serves a few other realms on other ports--this is convenient if you wish to switch between the app and the tests without having to restart servers. Use the environment variable `WORKER_COUNT` to add additional workers. By default there is 1 worker for each realm server. Here's what is spun up with `start:all`:
7777

7878
| Port | Description | Running `start:all` | Running `start:base` |
7979
| ----- | ------------------------------------------------------------- | ------------------- | -------------------- |
@@ -82,13 +82,17 @@ Instead of running `pnpm start:base`, you can alternatively use `pnpm start:all`
8282
| :4201 | `/seed` seed realm || 🚫 |
8383
| :4202 | `/test` host test realm, `/node-test` node test realm || 🚫 |
8484
| :4205 | `/test` realm for matrix client tests (playwright controlled) | 🚫 | 🚫 |
85+
| :4210 | Development Worker Manager (spins up 1 worker by default) | ✅ | 🚫 |
86+
| :4211 | Test Worker Manager (spins up 1 worker by default) | ✅ | 🚫 |
87+
| :4212 | Test Worker Manager for matrix client tests (playwright controlled - 1 worker) || 🚫 |
88+
| :4213 | Test Worker Manager for matrix client tests - base realm server (playwright controlled - 1 worker) || 🚫 |
8589
| :5001 | Mail user interface for viewing emails sent to local SMTP || 🚫 |
8690
| :5435 | Postgres DB || 🚫 |
8791
| :8008 | Matrix synapse server || 🚫 |
8892

8993
#### Using `start:development`
9094

91-
You can also use `start:development` if you want the functionality of `start:all`, but without running the test realms. `start:development` will enable you to open http://localhost:4201 and allow to select between the cards in the /base and /experiments realm.
95+
You can also use `start:development` if you want the functionality of `start:all`, but without running the test realms. `start:development` will enable you to open http://localhost:4201 and allow you to select between the cards in the /base and /experiments realms. To use `start:development` you must also run `start:worker-development` to start the workers (which are normally started in `start:all`).
9296

9397
### Card Pre-rendering
9498

packages/matrix/helpers/isolated-realm-server.ts

+62-9
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,36 @@ export async function startServer() {
3131
process.env.MATRIX_URL = 'http://localhost:8008';
3232
process.env.REALM_SERVER_MATRIX_USERNAME = 'realm_server';
3333

34+
let workerManager = spawn(
35+
'ts-node',
36+
[
37+
`--transpileOnly`,
38+
'worker-manager',
39+
`--port=4212`,
40+
`--matrixURL='http://localhost:8008'`,
41+
`--distURL="${process.env.HOST_URL ?? 'http://localhost:4200'}"`,
42+
43+
`--fromUrl='http://localhost:4205/test/'`,
44+
`--toUrl='http://localhost:4205/test/'`,
45+
`--fromUrl='https://cardstack.com/base/'`,
46+
`--toUrl='http://localhost:4201/base/'`,
47+
],
48+
{
49+
cwd: realmServerDir,
50+
stdio: ['pipe', 'pipe', 'pipe', 'ipc'],
51+
},
52+
);
53+
if (workerManager.stdout) {
54+
workerManager.stdout.on('data', (data: Buffer) =>
55+
console.log(`worker: ${data.toString()}`),
56+
);
57+
}
58+
if (workerManager.stderr) {
59+
workerManager.stderr.on('data', (data: Buffer) =>
60+
console.error(`worker: ${data.toString()}`),
61+
);
62+
}
63+
3464
let realmServer = spawn(
3565
'ts-node',
3666
[
@@ -40,13 +70,14 @@ export async function startServer() {
4070
`--matrixURL='http://localhost:8008'`,
4171
`--realmsRootPath='${dir.name}'`,
4272
`--seedPath='${seedPath}'`,
73+
`--workerManagerPort=4212`,
4374
`--migrateDB`,
4475
`--useRegistrationSecretFunction`,
4576

4677
`--path='${testRealmDir}'`,
4778
`--username='test_realm'`,
48-
`--fromUrl='/test/'`,
49-
`--toUrl='/test/'`,
79+
`--fromUrl='http://localhost:4205/test/'`,
80+
`--toUrl='http://localhost:4205/test/'`,
5081
`--fromUrl='https://cardstack.com/base/'`,
5182
`--toUrl='http://localhost:4201/base/'`,
5283
],
@@ -55,6 +86,7 @@ export async function startServer() {
5586
stdio: ['pipe', 'pipe', 'pipe', 'ipc'],
5687
},
5788
);
89+
realmServer.unref();
5890
if (realmServer.stdout) {
5991
realmServer.stdout.on('data', (data: Buffer) =>
6092
console.log(`realm server: ${data.toString()}`),
@@ -91,25 +123,36 @@ export async function startServer() {
91123
);
92124
}
93125

94-
return new IsolatedRealmServer(realmServer, testRealmDir);
126+
return new IsolatedRealmServer(realmServer, workerManager, testRealmDir);
95127
}
96128

97129
export class IsolatedRealmServer {
98-
private stopped: (() => void) | undefined;
130+
private realmServerStopped: (() => void) | undefined;
131+
private workerManagerStopped: (() => void) | undefined;
99132
private sqlResults: ((results: string) => void) | undefined;
100133
private sqlError: ((error: string) => void) | undefined;
101134

102135
constructor(
103136
private realmServerProcess: ReturnType<typeof spawn>,
137+
private workerManagerProcess: ReturnType<typeof spawn>,
104138
readonly realmPath: string, // useful for debugging
105139
) {
140+
workerManagerProcess.on('message', (message) => {
141+
if (message === 'stopped') {
142+
if (!this.workerManagerStopped) {
143+
console.error(`received unprompted worker manager stop`);
144+
return;
145+
}
146+
this.workerManagerStopped();
147+
}
148+
});
106149
realmServerProcess.on('message', (message) => {
107150
if (message === 'stopped') {
108-
if (!this.stopped) {
151+
if (!this.realmServerStopped) {
109152
console.error(`received unprompted server stop`);
110153
return;
111154
}
112-
this.stopped();
155+
this.realmServerStopped();
113156
} else if (
114157
typeof message === 'string' &&
115158
message.startsWith('sql-results:')
@@ -149,10 +192,20 @@ export class IsolatedRealmServer {
149192
}
150193

151194
async stop() {
152-
let stop = new Promise<void>((r) => (this.stopped = r));
195+
let realmServerStop = new Promise<void>(
196+
(r) => (this.realmServerStopped = r),
197+
);
153198
this.realmServerProcess.send('stop');
154-
await stop;
155-
this.stopped = undefined;
199+
await realmServerStop;
200+
this.realmServerStopped = undefined;
156201
this.realmServerProcess.send('kill');
202+
203+
let workerManagerStop = new Promise<void>(
204+
(r) => (this.workerManagerStopped = r),
205+
);
206+
this.workerManagerProcess.send('stop');
207+
await workerManagerStop;
208+
this.workerManagerStopped = undefined;
209+
this.workerManagerProcess.send('kill');
157210
}
158211
}

packages/realm-server/main.ts

+54-49
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,12 @@ import { NodeAdapter } from './node-realm';
1111
import yargs from 'yargs';
1212
import { RealmServer } from './server';
1313
import { resolve } from 'path';
14-
import { spawn } from 'child_process';
14+
import { createConnection, type Socket } from 'net';
1515
import { makeFastBootIndexRunner } from './fastboot';
1616
import { shimExternals } from './lib/externals';
1717
import * as Sentry from '@sentry/node';
1818
import { PgAdapter, PgQueuePublisher } from '@cardstack/postgres';
1919
import { MatrixClient } from '@cardstack/runtime-common/matrix-client';
20-
import flattenDeep from 'lodash/flattenDeep';
2120
import 'decorator-transforms/globals';
2221

2322
let log = logger('main');
@@ -68,6 +67,7 @@ let {
6867
useRegistrationSecretFunction,
6968
seedPath,
7069
migrateDB,
70+
workerManagerPort,
7171
} = yargs(process.argv.slice(2))
7272
.usage('Start realm server')
7373
.options({
@@ -130,6 +130,11 @@ let {
130130
'The flag should be set when running matrix tests where the synapse instance is torn down and restarted multiple times during the life of the realm server.',
131131
type: 'boolean',
132132
},
133+
workerManagerPort: {
134+
description:
135+
'The port the worker manager is running on. used to wait for the workers to be ready',
136+
type: 'number',
137+
},
133138
})
134139
.parseSync();
135140

@@ -165,8 +170,8 @@ let virtualNetwork = new VirtualNetwork();
165170
shimExternals(virtualNetwork);
166171

167172
let urlMappings = fromUrls.map((fromUrl, i) => [
168-
new URL(String(fromUrl), `http://localhost:${port}`),
169-
new URL(String(toUrls[i]), `http://localhost:${port}`),
173+
new URL(String(fromUrl)),
174+
new URL(String(toUrls[i])),
170175
]);
171176
for (let [from, to] of urlMappings) {
172177
virtualNetwork.addURLMapping(from, to);
@@ -185,7 +190,9 @@ let autoMigrate = migrateDB || undefined;
185190
manager.getOptions.bind(manager),
186191
);
187192

188-
await startWorker({ autoMigrate });
193+
if (workerManagerPort != null) {
194+
await waitForWorkerManager(workerManagerPort);
195+
}
189196

190197
for (let [i, path] of paths.entries()) {
191198
let url = hrefs[i][0];
@@ -324,51 +331,49 @@ let autoMigrate = migrateDB || undefined;
324331
process.exit(-3);
325332
});
326333

327-
async function startWorker(opts?: { autoMigrate?: true }) {
328-
let worker = spawn(
329-
'ts-node',
330-
[
331-
'--transpileOnly',
332-
'worker',
333-
`--port=${port}`,
334-
`--matrixURL='${matrixURL}'`,
335-
`--distURL='${distURL}'`,
336-
...(opts?.autoMigrate ? [`--migrateDB`] : []),
337-
...flattenDeep(
338-
urlMappings.map(([from, to]) => [
339-
`--fromUrl='${from}'`,
340-
`--toUrl='${to}'`,
341-
]),
342-
),
343-
],
344-
{
345-
stdio: ['pipe', 'pipe', 'pipe', 'ipc'],
346-
},
347-
);
334+
let workerReadyDeferred: Deferred<boolean> | undefined;
335+
async function waitForWorkerManager(port: number) {
336+
const workerManager = await new Promise<Socket>((r) => {
337+
let socket = createConnection({ port }, () => {
338+
log.info(`Connected to worker manager on port ${port}`);
339+
r(socket);
340+
});
341+
});
348342

349-
if (worker.stdout) {
350-
worker.stdout.on('data', (data: Buffer) =>
351-
log.info(`worker: ${data.toString()}`),
352-
);
353-
}
354-
if (worker.stderr) {
355-
worker.stderr.on('data', (data: Buffer) =>
356-
console.error(`worker: ${data.toString()}`),
357-
);
358-
}
343+
workerManager.on('data', (data) => {
344+
let res = data.toString();
345+
if (!workerReadyDeferred) {
346+
throw new Error(
347+
`received unsolicited message from worker manager on port ${port}`,
348+
);
349+
}
350+
switch (res) {
351+
case 'ready':
352+
case 'not-ready':
353+
workerReadyDeferred.fulfill(res === 'ready' ? true : false);
354+
break;
355+
default:
356+
workerReadyDeferred.reject(
357+
`unexpected response from worker manager: ${res}`,
358+
);
359+
}
360+
});
359361

360-
let timeout = await Promise.race([
361-
new Promise<void>((r) => {
362-
worker.on('message', (message) => {
363-
if (message === 'ready') {
364-
r();
365-
}
366-
});
367-
}),
368-
new Promise<true>((r) => setTimeout(() => r(true), 30_000)),
369-
]);
370-
if (timeout) {
371-
console.error(`timed-out waiting for worker to start. Stopping server`);
372-
process.exit(-2);
362+
try {
363+
let isReady = false;
364+
let timeout = Date.now() + 30_000;
365+
do {
366+
workerReadyDeferred = new Deferred();
367+
workerManager.write('ready?');
368+
isReady = await workerReadyDeferred.promise;
369+
} while (!isReady && Date.now() < timeout);
370+
if (!isReady) {
371+
throw new Error(
372+
`timed out trying to connect to worker manager on port ${port}`,
373+
);
374+
}
375+
} finally {
376+
workerManager.end();
373377
}
378+
log.info('workers are ready');
374379
}

0 commit comments

Comments
 (0)