George Kalpakas 966598cda7 fix(ngcc): support recovering when a worker process crashes (#36626)
Previously, when running in parallel mode and a worker process crashed
while processing a task, it was not possible for ngcc to continue
without risking ending up with a corrupted entry-point and therefore it
exited with an error. This, for example, could happen when a worker
process received a `SIGKILL` signal, which was frequently observed in CI
environments. This was probably the result of Docker killing processes
due to increased memory pressure.

One factor that amplifies the problem under Docker (which is often used
in CI) is that it is not possible to distinguish between the available
CPU cores on the host machine and the ones made available to Docker
containers, thus resulting in ngcc spawning too many worker processes.

This commit addresses these issues in the following ways:

1. We take advantage of the fact that files are written to disk only
   after an entry-point has been fully analyzed/compiled. The master
   process can now determine whether a worker process has not yet
   started writing files to disk (even if it was in the middle of
   processing a task) and just put the task back into the tasks queue if
   the worker process crashes.

2. The master process keeps track of the transformed files that a worker
   process will attempt to write to disk. If the worker process crashes
   while writing files, the master process can revert any changes and
   put the task back into the tasks queue (without risking corruption).

3. When a worker process crashes while processing a task (which can be a
   result of increased memory pressure or too many worker processes),
   the master process will not try to re-spawn it. This way the number
   of worker processes is gradually adjusted to a level that can be
   accommodated by the system's resources.

Examples of ngcc being able to recover after a worker process crashed:
- While idling: https://circleci.com/gh/angular/angular/682197
- While compiling: https://circleci.com/gh/angular/angular/682209
- While writing files: https://circleci.com/gh/angular/angular/682267

Jira issue: [FW-2008](https://angular-team.atlassian.net/browse/FW-2008)

Fixes #36278

PR Close #36626
2020-04-29 14:28:26 -07:00

137 lines
5.2 KiB
TypeScript

/**
* @license
* Copyright Google Inc. All Rights Reserved.
*
* Use of this source code is governed by an MIT-style license that can be
* found in the LICENSE file at https://angular.io/license
*/
/// <reference types="node" />
import {getFileSystem} from '@angular/compiler-cli/src/ngtsc/file_system';
import * as cluster from 'cluster';
import {MockFileSystemNative, runInEachFileSystem} from '../../../../src/ngtsc/file_system/testing';
import {ClusterExecutor} from '../../../src/execution/cluster/executor';
import {ClusterMaster} from '../../../src/execution/cluster/master';
import {AsyncLocker} from '../../../src/locking/async_locker';
import {FileWriter} from '../../../src/writing/file_writer';
import {PackageJsonUpdater} from '../../../src/writing/package_json_updater';
import {MockLockFile} from '../../helpers/mock_lock_file';
import {MockLogger} from '../../helpers/mock_logger';
import {mockProperty} from '../../helpers/spy_utils';
runInEachFileSystem(() => {
  // Specs for `ClusterExecutor`. `ClusterMaster#run()` is replaced by a spy in every spec, so its
  // real implementation never executes; only the executor's logging, its delegation to the master
  // and its use of the lock file are observed.
  describe('ClusterExecutor', () => {
    // Sentinel value resolved by the stubbed `ClusterMaster#run()`. It is only ever compared with
    // itself, to prove that `execute()` passes the master's result through.
    const MASTER_RUN_RESULT = 'ClusterMaster#run()';

    const runAsClusterMaster = mockProperty(cluster, 'isMaster');
    let masterRunSpy: jasmine.Spy;
    let mockLogger: MockLogger;
    let lockFileLog: string[];
    let mockLockFile: MockLockFile;
    let locker: AsyncLocker;
    let executor: ClusterExecutor;
    let createTaskCompletedCallback: jasmine.Spy;

    /**
     * Builds a `ClusterExecutor` from the fixtures currently held in the suite-level variables.
     *
     * The worker count (42) is what the "debug info" spec expects to see in the log message.
     * `FileWriter` and `PackageJsonUpdater` are passed as `null` placeholders.
     * NOTE(review): presumably they are never dereferenced because `ClusterMaster#run()` is
     * stubbed - confirm against `ClusterExecutor`/`ClusterMaster`.
     */
    const createExecutor = (): ClusterExecutor => new ClusterExecutor(
        42, getFileSystem(), mockLogger, null as unknown as FileWriter,
        null as unknown as PackageJsonUpdater, locker, createTaskCompletedCallback);

    /**
     * Runs `action` and reports how it failed.
     *
     * @param action The (possibly asynchronous) operation to run.
     * @returns The message of the error thrown/rejected by `action`, or `''` if it succeeded.
     *     Non-`Error` values are stringified so that an unexpected throw is still visible in the
     *     assertion failure instead of showing up as `undefined`.
     */
    const captureErrorMessage = async(action: () => Promise<unknown>): Promise<string> => {
      try {
        await action();
      } catch (e) {
        return e instanceof Error ? e.message : String(e);
      }
      return '';
    };

    beforeEach(() => {
      // `as any`: the sentinel string stands in for whatever `run()` really resolves with.
      masterRunSpy = spyOn(ClusterMaster.prototype, 'run')
                         .and.returnValue(Promise.resolve(MASTER_RUN_RESULT as any));
      createTaskCompletedCallback = jasmine.createSpy('createTaskCompletedCallback');

      mockLogger = new MockLogger();
      // `lockFileLog` is handed to the mock lock file; the specs below expect it to contain
      // `'write()'` / `'remove()'` entries in call order.
      lockFileLog = [];
      mockLockFile = new MockLockFile(new MockFileSystemNative(), lockFileLog);
      locker = new AsyncLocker(mockLockFile, mockLogger, 200, 2);

      executor = createExecutor();
    });

    describe('execute()', () => {
      // Placeholder for the `analyzeEntryPoints`/`createCompileFn` arguments in specs that do not
      // inspect them. Typed as returning `any` so that it fits both parameter types.
      const noop: () => any = () => undefined;

      beforeEach(() => runAsClusterMaster(true));

      it('should log debug info about the executor', async () => {
        await executor.execute(noop, noop);

        expect(mockLogger.logs.debug).toEqual([
          ['Running ngcc on ClusterExecutor (using 42 worker processes).'],
        ]);
      });

      it('should delegate to `ClusterMaster#run()`', async () => {
        const analyzeEntryPointsSpy = jasmine.createSpy('analyzeEntryPoints');
        const createCompilerFnSpy = jasmine.createSpy('createCompilerFn');

        expect(await executor.execute(analyzeEntryPointsSpy, createCompilerFnSpy))
            .toBe(MASTER_RUN_RESULT as any);

        expect(masterRunSpy).toHaveBeenCalledWith();

        // Entry-points are analyzed on the master, but no compiler function is created there.
        expect(analyzeEntryPointsSpy).toHaveBeenCalledWith();
        expect(createCompilerFnSpy).not.toHaveBeenCalled();
      });

      it('should call LockFile.write() and LockFile.remove() if master runner completes successfully',
         async () => {
           await executor.execute(noop, noop);

           expect(lockFileLog).toEqual(['write()', 'remove()']);
         });

      it('should call LockFile.write() and LockFile.remove() if master runner fails', async () => {
        masterRunSpy.and.returnValue(Promise.reject(new Error('master runner error')));

        const error = await captureErrorMessage(() => executor.execute(noop, noop));

        expect(error).toEqual('master runner error');
        // The lock file must still be removed even though the master failed.
        expect(lockFileLog).toEqual(['write()', 'remove()']);
      });

      it('should not call master runner if LockFile.write() fails', async () => {
        spyOn(mockLockFile, 'write').and.callFake(() => {
          lockFileLog.push('write()');
          throw new Error('LockFile.write() error');
        });
        // Re-create the executor after installing the spy, as the original spec did.
        executor = createExecutor();

        const error = await captureErrorMessage(() => executor.execute(noop, noop));

        expect(error).toEqual('LockFile.write() error');
        expect(masterRunSpy).not.toHaveBeenCalled();
      });

      it('should fail if LockFile.remove() fails', async () => {
        spyOn(mockLockFile, 'remove').and.callFake(() => {
          lockFileLog.push('remove()');
          throw new Error('LockFile.remove() error');
        });
        // Re-create the executor after installing the spy, as the original spec did.
        executor = createExecutor();

        const error = await captureErrorMessage(() => executor.execute(noop, noop));

        expect(error).toEqual('LockFile.remove() error');
        expect(lockFileLog).toEqual(['write()', 'remove()']);
        expect(masterRunSpy).toHaveBeenCalled();
      });
    });
  });
});