diff --git a/config.example.json b/config.example.json index ac57a6282..a0b515593 100644 --- a/config.example.json +++ b/config.example.json @@ -13,6 +13,7 @@ "2014tc16rfcb047", "2014tc16rfpc001", "2014tc16rftn002", + "2014uk16rfop001", "bulgaria", "cordis", "devco", diff --git a/docs/types/README.md b/docs/types/README.md index 1e3daea98..4da61c4a5 100644 --- a/docs/types/README.md +++ b/docs/types/README.md @@ -19,6 +19,9 @@ Here's a list of the transformations made in ETLs around the `Project` model. - [2014tc16rfcb047 - XLS](./etls/2014tc16rfcb047-xls.md) - [2014tc16rfpc001 - XLS](./etls/2014tc16rfpc001-xls.md) - [2014tc16rftn002 - XLS](./etls/2014tc16rftn002-xls.md) +- [2014uk16rfop001 - CSV](./etls/2014uk16rfop001-csv.md) +- [2014uk16rfop001 - ODS](./etls/2014uk16rfop001-ods.md) +- [2014uk16rfop001 - XLS](./etls/2014uk16rfop001-xls.md) - [bulgaria - XLS](./etls/bulgaria-xls.md) - [CORDIS - CSV](./etls/cordis-csv.md) - [DEVCO - XLS](./etls/devco-xls.md) diff --git a/docs/types/etls/2014uk16rfop001-csv.md b/docs/types/etls/2014uk16rfop001-csv.md new file mode 100644 index 000000000..2fa5ade36 --- /dev/null +++ b/docs/types/etls/2014uk16rfop001-csv.md @@ -0,0 +1,162 @@ + + +## 2014uk16rfop001CsvTransform + +Map fields for 2014uk16rfop001 producer, CSV file types + +Example input data: [stub][1] + +Transform function: [implementation details][2] + +### Parameters + +- `record` **[Object][3]** Piece of data to transform before going to harmonized storage. + +Returns **Project** JSON matching the type fields. + +### getBudget + +Preprocess `budget`. + +Input fields taken from the `record` are: + +- `Total project costs �m (eligible project costs only)` +- `% of project funded by EU (Co-financing rate%)` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Budget** + +### getDescription + +Preprocess `description`. 
+ +Input fields taken from the `record` are: + +- `Summary of project(max 100 words)` +- `Local enterprise partnership area` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getProjectId + +Generates an ID for `project_id`. + +Input fields taken from the `record` are: + +- `Name of project` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getLocations + +Preprocess `project_locations`. + +Input fields taken from the `record` are: + +- `Location (postcode)` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[Location][6]>** + +### getThemes + +Preprocess `themes`. + +Input fields taken from the `record` are: + +- `Type and focus support (category of intervention)` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[String][4]>** + +### getThirdParties + +Preprocess `third_parties`. + +Input fields taken from the `record` are: + +- `Recipient of funds(ERDF/ESF beneficiary)` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<ThirdParty>** + +### formatDate + +Format date. + +#### Parameters + +- `date` **[Date][7]** + +Returns **[Date][7]** The date formatted into an ISO 8601 date format + +### getTimeframe + +Preprocess `timeframe`. + +Input fields taken from the `record` are: + +- `Start date` +- `End date` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Timeframe** + +### getTitle + +Preprocess `title`. + +Input fields taken from the `record` are: + +- `Name of project` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getType + +Preprocess `type`. 
+ +Input fields taken from the `record` are: + +- `Type of fund` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[String][4]>** + +[1]: https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/csv/test/stubs/record.json +[2]: https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/csv/src/lib/transform.js +[3]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object +[4]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String +[5]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array +[6]: https://developer.mozilla.org/docs/Web/API/Location +[7]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Date diff --git a/docs/types/etls/2014uk16rfop001-ods.md b/docs/types/etls/2014uk16rfop001-ods.md new file mode 100644 index 000000000..335f6a0ff --- /dev/null +++ b/docs/types/etls/2014uk16rfop001-ods.md @@ -0,0 +1,109 @@ + + +## 2014uk16rfop001OdsTransform + +Map fields for 2014uk16rfop001 producer, ODS file types + +Example input data: [stub][1] + +Transform function: [implementation details][2] + +### Parameters + +- `record` **[Object][3]** Piece of data to transform before going to harmonized storage. + +Returns **Project** JSON matching the type fields. + +### getBudget + +Preprocess `budget`. + +Input fields taken from the `record` are: + +- `Aid element £` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Budget** + +### getDescription + +Preprocess `description`. 
+ +Input fields taken from the `record` are: + +- `Beneficiary identifier (E-claims ref)` +- `Sector NACE group level` +- `SANI reference of the aid measure` +- `Objective of the aid` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getProjectId + +Preprocess `project_id`. + +Input fields taken from the `record` are: + +- `Objective of the aid` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getLocations + +Preprocess `project_locations`. + +Input fields taken from the `record` are: + +- `Location of Benficiary NUTS level II (drop down)` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[Location][6]>** + +### getThirdParties + +Preprocess `third_parties`. + +Input fields taken from the `record` are: + +- `Name of beneficiary` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<ThirdParty>** + +### getTimeframe + +Preprocess `timeframe`. 
+ +Input fields taken from the `record` are: + +- `Date of granting` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Timeframe** + +[1]: https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/ods/test/stubs/ESF/record.json +[2]: https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/ods/src/lib/transform/ESF/transform.js +[3]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object +[4]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String +[5]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array +[6]: https://developer.mozilla.org/docs/Web/API/Location diff --git a/docs/types/etls/2014uk16rfop001-xls.md b/docs/types/etls/2014uk16rfop001-xls.md new file mode 100644 index 000000000..0350c996b --- /dev/null +++ b/docs/types/etls/2014uk16rfop001-xls.md @@ -0,0 +1,532 @@ + + +## 2014uk16rfop001XlsTransform + +Map fields for 2014uk16rfop001 producer, XLS file types, ESF funding type. + +Example input data: [stub][1] + +Transform function: [implementation details][2] + +### Parameters + +- `record` **[Object][3]** Piece of data to transform before going to harmonized storage. + +Returns **Project** JSON matching the type fields. + +### getBudget + +Preprocess `budget`. + +Input fields taken from the `record` are: + +- `Total Eligible Expenditure Allocated to the Operation;Current` +- `Union co‑financing rate, as per priority axis;` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Budget** + +### getBudget + +Preprocess `budget`. + +Input fields taken from the `record` are: + +- `Total project costs £m` +- `ERDF/ESF investment £m` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Budget** + +### getDescription + +Preprocess `description`. 
+ +Input fields taken from the `record` are: + +- `Operation Summary` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getDescription + +Preprocess `description`. + +Input fields taken from the `record` are: + +- `Type of fund` +- `Priority Axis` +- `Summary of project (max 100 words)` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getProjectId + +Preprocess `project_id`. + +There are rows with overlapping information about beneficiaries and operations. +In order to keep them separate, as they are in the ingested file, we take into account the budgetary information as well. + +Input fields taken from the `record` are: + +- `Operation Name` +- `Total Eligible Expenditure Allocated to the Operation;Current` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getProjectId + +Preprocess `project_id`. + +Input fields taken from the `record` are: + +- `Name of Project` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getLocations + +Preprocess `project_locations`. + +Input fields taken from the `record` are: + +- `Operation postcode; or other appropriate location indicator;` +- `Country` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[Location][6]>** + +### getLocations + +Preprocess `project_locations`. + +Input fields taken from the `record` are: + +- `Location (postcode)` +- `Local Enterprise Partnership area` +- `Country` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[Location][6]>** + +### getThemes + +Preprocess `themes`. 
+ +Input fields taken from the `record` are: + +- `Category of Intervention` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[String][4]>** + +### getThemes + +Preprocess `themes`. + +Input fields taken from the `record` are: + +- `Type and focus of support (*Category of intervention)*` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[String][4]>** + +### getThirdParties + +Preprocess `third_parties`. + +Input fields taken from the `record` are: + +- `Beneficiary Name` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<ThirdParty>** + +### getThirdParties + +Preprocess `third_parties`. + +Input fields taken from the `record` are: + +- `Recipient of funds` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<ThirdParty>** + +### getTimeframe + +Preprocess `timeframe`. + +Input fields taken from the `record` are: + +- `Operation Start Date` +- `Operation End Date` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Timeframe** + +### getTimeframe + +Preprocess `timeframe`. + +Input fields taken from the `record` are: + +- `Start date` +- `End date` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Timeframe** + +### getTitle + +Preprocess `title`. + +Input fields taken from the `record` are: + +- `Operation Name` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getTitle + +Preprocess `title`. + +Input fields taken from the `record` are: + +- `Name of Project` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getCodeByCountry + +Gets country code from a country name. 
+ +#### Parameters + +- `countryName` **[String][4]** The name of the country + +Returns **[String][4]** The ISO 3166-1 country code + +## 2014uk16rfop001XlsTransform + +Map fields for 2014uk16rfop001 producer, XLS file types, ESIF funding type. + +Example input data: [stub][7] + +Transform function: [implementation details][8] + +### Parameters + +- `record` **[Object][3]** Piece of data to transform before going to harmonized storage. + +Returns **Project** JSON matching the type fields. + +### getBudget + +Preprocess `budget`. + +Input fields taken from the `record` are: + +- `Total Eligible Expenditure Allocated to the Operation;Current` +- `Union co‑financing rate, as per priority axis;` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Budget** + +### getBudget + +Preprocess `budget`. + +Input fields taken from the `record` are: + +- `Total project costs £m` +- `ERDF/ESF investment £m` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Budget** + +### getDescription + +Preprocess `description`. + +Input fields taken from the `record` are: + +- `Operation Summary` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getDescription + +Preprocess `description`. + +Input fields taken from the `record` are: + +- `Type of fund` +- `Priority Axis` +- `Summary of project (max 100 words)` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getProjectId + +Preprocess `project_id`. + +There are rows with overlapping information about beneficiaries and operations. +In order to keep them separate, as they are in the ingested file, we take into account the budgetary information as well. 
+ +Input fields taken from the `record` are: + +- `Operation Name` +- `Total Eligible Expenditure Allocated to the Operation;Current` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getProjectId + +Preprocess `project_id`. + +Input fields taken from the `record` are: + +- `Name of Project` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getLocations + +Preprocess `project_locations`. + +Input fields taken from the `record` are: + +- `Operation postcode; or other appropriate location indicator;` +- `Country` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[Location][6]>** + +### getLocations + +Preprocess `project_locations`. + +Input fields taken from the `record` are: + +- `Location (postcode)` +- `Local Enterprise Partnership area` +- `Country` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[Location][6]>** + +### getThemes + +Preprocess `themes`. + +Input fields taken from the `record` are: + +- `Category of Intervention` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[String][4]>** + +### getThemes + +Preprocess `themes`. + +Input fields taken from the `record` are: + +- `Type and focus of support (*Category of intervention)*` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<[String][4]>** + +### getThirdParties + +Preprocess `third_parties`. + +Input fields taken from the `record` are: + +- `Beneficiary Name` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<ThirdParty>** + +### getThirdParties + +Preprocess `third_parties`. 
+ +Input fields taken from the `record` are: + +- `Recipient of funds` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[Array][5]<ThirdParty>** + +### getTimeframe + +Preprocess `timeframe`. + +Input fields taken from the `record` are: + +- `Operation Start Date` +- `Operation End Date` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Timeframe** + +### getTimeframe + +Preprocess `timeframe`. + +Input fields taken from the `record` are: + +- `Start date` +- `End date` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **Timeframe** + +### getTitle + +Preprocess `title`. + +Input fields taken from the `record` are: + +- `Operation Name` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getTitle + +Preprocess `title`. + +Input fields taken from the `record` are: + +- `Name of Project` + +#### Parameters + +- `record` **[Object][3]** The row received from parsed file + +Returns **[String][4]** + +### getCodeByCountry + +Gets country code from a country name. 
+ +#### Parameters + +- `countryName` **[String][4]** The name of the country + +Returns **[String][4]** The ISO 3166-1 country code + +[1]: https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/xls/test/stubs/ESF/record.json +[2]: https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/xls/src/lib/transform/ESF/transform.js +[3]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Object +[4]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/String +[5]: https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Array +[6]: https://developer.mozilla.org/docs/Web/API/Location +[7]: https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/xls/test/stubs/ESIF/record.json +[8]: https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/xls/src/lib/transform/ESIF/transform.js diff --git a/scripts/documentation/docs-md.js b/scripts/documentation/docs-md.js index fea1537ad..29f579865 100755 --- a/scripts/documentation/docs-md.js +++ b/scripts/documentation/docs-md.js @@ -21,6 +21,9 @@ const transforms = [ '2014tc16rfcb047-xls', '2014tc16rfpc001-xls', '2014tc16rftn002-xls', + '2014uk16rfop001-xls', + '2014uk16rfop001-csv', + '2014uk16rfop001-ods', 'bulgaria-xls', 'cordis-csv', 'devco-xls', @@ -37,10 +40,10 @@ const transforms = [ ]; transforms.forEach(transform => { - const etl = transform.split('-'); + const [name, format] = transform.split('-'); documentation - .build(`**/etl/${etl[0]}/${etl[1]}/**/transform.js`, {}) + .build(`**/etl/${name}/${format}/**/transform.js`, {}) .then(documentation.formats.md) .then(output => { fs.writeFileSync( diff --git a/services/ingestion/etl/2014uk16rfop001/csv/README.md b/services/ingestion/etl/2014uk16rfop001/csv/README.md new file mode 100644 index 000000000..2f3b94241 --- /dev/null +++ 
b/services/ingestion/etl/2014uk16rfop001/csv/README.md @@ -0,0 +1,19 @@ +# 2014uk16rfop001 CSV ETL mapping rules + +Model to compare with is available at: https://ec-europa.github.io/eubfr-data-lake/ + +| Field | Target | +| ---------------------------------------------------- | ----------------- | +| Recipient of funds(ERDF/ESF beneficiary) | third_parties | +| Name of project | title | +| Type of fund | type | +| Summary of project(max 100 words) | description | +| Start date | timeframe.from | +| End date | timeframe.to | +| ERDF/ESF investment �m | | +| Total project costs �m (eligible project costs only) | budget.total_cost | +| % of project funded by EU (Co-financing rate%) | budget.eu_contrib | +| Location (postcode) | project_locations | +| Local enterprise partnership area | description | +| Country | | +| Type and focus support (category of intervention) | themes | diff --git a/services/ingestion/etl/2014uk16rfop001/csv/babel.config.js b/services/ingestion/etl/2014uk16rfop001/csv/babel.config.js new file mode 100644 index 000000000..0397ff2b1 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/csv/babel.config.js @@ -0,0 +1,29 @@ +module.exports = { + presets: [ + '@babel/preset-flow', + [ + '@babel/preset-env', + { + targets: { + node: '8.10', + }, + modules: false, + loose: true, + }, + ], + ], + env: { + test: { + presets: [ + [ + '@babel/preset-env', + { + targets: { + node: '8.10', + }, + }, + ], + ], + }, + }, +}; diff --git a/services/ingestion/etl/2014uk16rfop001/csv/package.json b/services/ingestion/etl/2014uk16rfop001/csv/package.json new file mode 100644 index 000000000..b842dcb29 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/csv/package.json @@ -0,0 +1,33 @@ +{ + "private": true, + "name": "@eubfr/ingestion-etl-2014uk16rfop001-csv", + "version": "0.6.0", + "scripts": { + "deploy": "sls deploy -v", + "test:unit": "jest --testPathPattern=unit" + }, + "dependencies": { + "@eubfr/lib": "^0.7.0", + "@eubfr/logger-messenger": 
"^0.7.0", + "csv-parse": "4.3.4", + "numeral": "2.0.6" + }, + "devDependencies": { + "@babel/core": "7.4.3", + "@babel/preset-env": "7.4.3", + "@babel/preset-flow": "7.0.0", + "@eubfr/types": "^0.7.0", + "aws-sdk": "2.434.0", + "babel-jest": "24.7.0", + "babel-loader": "8.0.5", + "jest": "24.7.0", + "serverless": "1.40.0", + "serverless-webpack": "5.2.0", + "webpack": "4.29.6" + }, + "jest": { + "transform": { + "^.+\\.js$": "babel-jest" + } + } +} diff --git a/services/ingestion/etl/2014uk16rfop001/csv/serverless.yml b/services/ingestion/etl/2014uk16rfop001/csv/serverless.yml new file mode 100644 index 000000000..258774be9 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/csv/serverless.yml @@ -0,0 +1,107 @@ +service: ingestion-etl-2014uk16rfop001-csv + +plugins: + - serverless-webpack + +custom: + webpack: + webpackConfig: ./webpack.config.js + includeModules: + forceExclude: + - aws-sdk + packager: yarn + eubfrEnvironment: ${opt:eubfr_env, file(../../../../../config.json):eubfr_env, env:EUBFR_ENV, 'dev'} + bucketName: ${file(../../../../../resources/harmonized-storage/serverless.yml):custom.bucketName} + +package: + individually: true + +provider: + name: aws + runtime: nodejs8.10 + timeout: 180 + stage: ${opt:stage, file(../../../../../config.json):stage, env:EUBFR_STAGE, 'dev'} + region: ${opt:region, file(../../../../../config.json):region, env:EUBFR_AWS_REGION, 'eu-central-1'} + deploymentBucket: + name: eubfr-${self:custom.eubfrEnvironment}-deploy + stackTags: + ENV: ${self:custom.eubfrEnvironment} + iamRoleStatements: + - Effect: 'Allow' + Action: + - 's3:PutObject' + Resource: + Fn::Join: + - '' + - - 'arn:aws:s3:::' + - ${self:custom.bucketName} + - '/*' + # Allow queueing messages to the DLQ https://docs.aws.amazon.com/lambda/latest/dg/dlq.html + - Effect: 'Allow' + Action: + - sqs:SendMessage + Resource: '*' + +functions: + parseCsv: + handler: src/events/onParseCSV.handler + name: ${self:provider.stage}-${self:service}-parseCsv + memorySize: 
512 + environment: + BUCKET: ${self:custom.bucketName} + REGION: ${self:provider.region} + STAGE: ${self:provider.stage} + events: + - sns: + arn: + Fn::Join: + - '' + - - 'arn:aws:sns:' + - Ref: 'AWS::Region' + - ':' + - Ref: 'AWS::AccountId' + - ':${self:provider.stage}-etl-2014uk16rfop001-csv' + topicName: ${self:provider.stage}-etl-2014uk16rfop001-csv + +resources: + Resources: + ParseCsvLambdaFunction: + Type: 'AWS::Lambda::Function' + Properties: + DeadLetterConfig: + TargetArn: + Fn::ImportValue: ${self:provider.stage}:ingestion-dead-letter-queue:LambdaFailureQueue + SNSTopic2014uk16rfop001CSV: + Type: AWS::SNS::Topic + Properties: + TopicName: ${self:provider.stage}-etl-2014uk16rfop001-csv + DisplayName: 2014uk16rfop001 CSV ETL + SNSTopic2014uk16rfop001CSVPolicy: + Type: AWS::SNS::TopicPolicy + Properties: + PolicyDocument: + Version: '2012-10-17' + Statement: + - Sid: Allow-IngestionManager-Publish + Action: + - sns:Publish + Effect: Allow + Resource: + Fn::Join: + - '' + - - 'arn:aws:sns:' + - Ref: 'AWS::Region' + - ':' + - Ref: 'AWS::AccountId' + - ':${self:provider.stage}-etl-2014uk16rfop001-csv' + Principal: + AWS: + Fn::Join: + - '' + - - 'arn:aws:sts::' + - Ref: 'AWS::AccountId' + - ':assumed-role/ingestion-manager-${self:provider.stage}-' + - Ref: 'AWS::Region' + - '-lambdaRole/${self:provider.stage}-ingestion-manager-onObjectCreated' + Topics: + - Ref: SNSTopic2014uk16rfop001CSV diff --git a/services/ingestion/etl/2014uk16rfop001/csv/src/events/onParseCSV.js b/services/ingestion/etl/2014uk16rfop001/csv/src/events/onParseCSV.js new file mode 100644 index 000000000..d1644586c --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/csv/src/events/onParseCSV.js @@ -0,0 +1,139 @@ +import AWS from 'aws-sdk'; // eslint-disable-line import/no-extraneous-dependencies +import parse from 'csv-parse/lib/sync'; + +// ETL utilities. 
+import ensureExtensions from '@eubfr/lib/etl/ensureExtensions'; +import extractMessage from '@eubfr/lib/etl/extractMessage'; +import handleError from '@eubfr/lib/etl/handleError'; + +import MessengerFactory from '@eubfr/logger-messenger/src/lib/MessengerFactory'; +import { STATUS } from '@eubfr/logger-messenger/src/lib/status'; + +import transformRecord from '../lib/transform'; +/** Lambda handler for the CSV ETL: reads the CSV referenced by the incoming message from S3, merges rows that share a `Name of project`, transforms each merged row and uploads the result as NDJSON to `BUCKET`. Throws when BUCKET, REGION or STAGE is unset or the key is not a .csv file; errors after that point are handed to handleError. */ +export const handler = async (event, context) => { + const { BUCKET, REGION, STAGE } = process.env; + + if (!BUCKET || !REGION || !STAGE) { + throw new Error( + 'BUCKET, REGION and STAGE environment variables are required!' + ); + } + + const snsMessage = extractMessage(event); + const { key } = snsMessage.object; + + if (!ensureExtensions({ file: key, extensions: ['.csv'] })) { + throw new Error('CSV file expected for this ETL.'); + } + + const messenger = MessengerFactory.Create({ context }); + const s3 = new AWS.S3(); + + try { + await messenger.send({ + message: { + computed_key: key, + status_message: 'Start parsing CSV...', + status_code: STATUS.PARSING, + }, + to: ['logs'], + }); + + let projects = ''; + const recordsAgg = []; + const separator = ';'; + const mergeFields = [ + 'ERDF/ESF investment �m', + 'Location (postcode)', + '% of project funded by EU (Co-financing rate%)', + 'Total project costs �m (eligible project costs only)', + ]; + + const file = await s3 + .getObject({ Bucket: snsMessage.bucket.name, Key: key }) + .promise(); + + const csvData = file.Body.toString(); + const records = parse(csvData); + + // CSV file is not formatted well for this ETL. + // We need to selectively take information from it. + + records.shift(); // Title/Type of fund. + records.shift(); // Empty row + records.shift(); // Empty row + records.shift(); // Empty row + + const headerRow = records + .shift() + .map(el => el.trim().replace(/(\r\n|\n|\r)/gm, '')); + + records.pop(); // Notes + records.pop(); // Empty row + + // Normalize the list: re-key each row's positional values with the cleaned header names. 
+ const mappedRecords = records.map(record => { + const mapped = {}; + + Object.keys(record).forEach((field, i) => { + mapped[headerRow[i]] = record[field]; + }); + + return mapped; + }); + + mappedRecords.forEach(record => { + const indexExisting = recordsAgg.findIndex( + needle => needle['Name of project'] === record['Name of project'] + ); + + // If not present yet. + if (indexExisting === -1) { + recordsAgg.push(record); + } + // If there's an existing object with that name, we have to update it (it is removed and re-appended, so it moves to the end). + else { + const existing = recordsAgg.splice(indexExisting, 1)[0]; + + mergeFields.forEach(field => { + // Concatenate old and current values for the given field, joined by the separator (transform.js splits budget and location fields on ';'). + existing[field] = `${existing[field]}${separator}${record[field]}`; + }); + + recordsAgg.push(existing); + } + }); + + recordsAgg.forEach(record => { + const data = transformRecord(record); + projects += `${JSON.stringify(data)}\n`; + }); + + // Upload the data to the harmonized storage bucket. + const params = { + Bucket: BUCKET, + Key: `${key}.ndjson`, + Body: projects, + ContentType: 'application/x-ndjson', + }; + + await s3.upload(params).promise(); + + await messenger.send({ + message: { + computed_key: key, + status_message: + 'CSV parsed successfully. 
Results will be uploaded to ElasticSearch soon...', + status_code: STATUS.PARSED, + }, + to: ['logs'], + }); + + return console.log('Done'); + } catch (error) { + return handleError({ messenger, key, statusCode: STATUS.ERROR }, { error }); + } +}; + +export default handler; diff --git a/services/ingestion/etl/2014uk16rfop001/csv/src/lib/transform.js b/services/ingestion/etl/2014uk16rfop001/csv/src/lib/transform.js new file mode 100644 index 000000000..fb5c50384 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/csv/src/lib/transform.js @@ -0,0 +1,318 @@ +// @flow + +import crypto from 'crypto'; +import numeral from 'numeral'; +import sanitizeBudgetItem from '@eubfr/lib/budget/budgetFormatter'; +import type { Project } from '@eubfr/types'; + +/* + * Transform message (2014uk16rfop001 CSV) + */ + +/** + * Preprocess `budget`: the `;`-joined costs and rates of merged rows are each summed, and `eu_contrib` is the summed cost multiplied by the summed rate. NOTE(review): confirm that summing the co-financing rates of merged rows is intended. + * + * Input fields taken from the `record` are: + * + * - `Total project costs �m (eligible project costs only)` + * - `% of project funded by EU (Co-financing rate%)` + * + * @memberof 2014uk16rfop001CsvTransform + * @param {Object} record The row received from parsed file + * @returns {Budget} + */ + +const getBudget = record => { + const rates = record['% of project funded by EU (Co-financing rate%)'].split( + ';' + ); + const costs = record[ + 'Total project costs �m (eligible project costs only)' + ].split(';'); + + let rate = 0; + + rates.forEach(percentage => { + const { _value: percent } = numeral(percentage); + rate += percent; + }); + + let cost = 0; + + costs.forEach(costItem => { + const { _value: costValue } = numeral(costItem); + cost += costValue; + }); + + return { + eu_contrib: sanitizeBudgetItem({ + value: cost * rate, + currency: 'GBP', + raw: record['% of project funded by EU (Co-financing rate%)'], + }), + funding_area: [], + mmf_heading: '', + other_contrib: sanitizeBudgetItem(), + private_fund: sanitizeBudgetItem(), + public_fund: sanitizeBudgetItem(), + total_cost: sanitizeBudgetItem({ + value: cost, + currency: 
'GBP', + raw: record['Total project costs �m (eligible project costs only)'], + }), + }; +}; + +/** + * Preprocess `description`: one "field: value" line for each input field that has a non-empty value. + * + * Input fields taken from the `record` are: + * + * - `Summary of project(max 100 words)` + * - `Local enterprise partnership area` + * + * @memberof 2014uk16rfop001CsvTransform + * @param {Object} record The row received from parsed file + * @returns {String} + */ + +const getDescription = record => { + const fields = [ + 'Summary of project(max 100 words)', + 'Local enterprise partnership area', + ]; + + let description = ''; + + fields.forEach(field => { + if (record[field]) { + description += `${field}: ${record[field]} \n`; + } + }); + + return description; +}; + +/** + * Generates an ID for `project_id`: the MD5 hex digest of the project name. + * + * Input fields taken from the `record` are: + * - `Name of project` + * + * @memberof 2014uk16rfop001CsvTransform + * @param {Object} record The row received from parsed file + * @returns {String} + */ + +const getProjectId = record => + crypto + .createHash('md5') + .update(record['Name of project']) + .digest('hex'); + +/** + * Preprocess `project_locations`: one entry per non-empty `;`-separated value, stored in `region` with `country_code` fixed to `GB`. NOTE(review): the postcode lands in `region` while `postal_code` stays empty - confirm this is intended. + * + * Input fields taken from the `record` are: + * + * - `Location (postcode)` + * + * @memberof 2014uk16rfop001CsvTransform + * @param {Object} record The row received from parsed file + * @returns {Array<Location>} + */ + +const getLocations = record => { + const locations = []; + + const regions = record['Location (postcode)'] + .split(';') + .filter(a => a) + .map(a => a.trim()); + + regions.forEach(region => { + locations.push({ + address: '', + centroid: null, + country_code: 'GB', + location: null, + nuts: [], + postal_code: '', + region, + town: '', + }); + }); + + return locations; +}; + +/** + * Preprocess `themes`. 
/**
 * Preprocess `themes`.
 *
 * Input fields taken from the `record` are:
 *
 * - `Type and focus support (category of intervention)`
 *
 * @memberof 2014uk16rfop001CsvTransform
 * @param {Object} record The row received from parsed file
 * @returns {Array} The trimmed theme, or an empty array when the column is empty or missing
 */

const getThemes = record => {
  const theme = record['Type and focus support (category of intervention)'];

  // Same guard as `getType`: an absent column yields no theme instead of a TypeError.
  return theme ? [theme.trim()] : [];
};

/**
 * Preprocess `third_parties`.
 *
 * Input fields taken from the `record` are:
 *
 * - `Recipient of funds(ERDF/ESF beneficiary)`
 *
 * @memberof 2014uk16rfop001CsvTransform
 * @param {Object} record The row received from parsed file
 * @returns {Array} A single beneficiary entry, or an empty array when the column is empty
 */

const getThirdParties = record => {
  const beneficiary = record['Recipient of funds(ERDF/ESF beneficiary)'];

  if (!beneficiary) return [];

  return [
    {
      address: '',
      country: 'England',
      email: '',
      name: beneficiary.trim(),
      phone: '',
      region: '',
      role: 'Beneficiary',
      type: '',
      website: '',
    },
  ];
};

// English month names, in calendar order. A month token from the input is
// resolved to the first name it is a prefix of (`apr`, `sept`, `december`).
const MONTH_NAMES = [
  'january',
  'february',
  'march',
  'april',
  'may',
  'june',
  'july',
  'august',
  'september',
  'october',
  'november',
  'december',
];

/**
 * Format date.
 *
 * Converts a month-year value such as `Apr-15` into the ISO 8601 timestamp of
 * the first day of that month at midnight UTC.
 *
 * @memberof 2014uk16rfop001CsvTransform
 * @param {String} date Month name (at least three letters) and year, separated by a dash
 * @returns {String|null} ISO 8601 string, or `null` when the value cannot be interpreted
 */
const formatDate = date => {
  if (!date || typeof date !== 'string') return null;

  const parts = date.split('-');
  if (parts.length !== 2) return null;

  const monthToken = parts[0].trim().toLowerCase();
  const yearToken = parts[1].trim();

  // Look the month up explicitly: handing the split parts to `new Date()`
  // would rely on engine-specific parsing of a non-ISO string.
  const month =
    monthToken.length >= 3
      ? MONTH_NAMES.findIndex(name => name.startsWith(monthToken))
      : -1;
  if (month === -1) return null;

  let year;
  if (/^\d{2}$/.test(yearToken)) {
    // Two-digit years in this dataset belong to the 2000s.
    year = Number(`20${yearToken}`);
  } else if (/^[1-9]\d{3}$/.test(yearToken)) {
    year = Number(yearToken);
  } else {
    return null;
  }

  return new Date(Date.UTC(year, month)).toISOString();
};

/**
 * Preprocess `timeframe`.
 *
 * Input fields taken from the `record` are:
 *
 * - `Start date`
 * - `End date`
 *
 * @memberof 2014uk16rfop001CsvTransform
 * @param {Object} record The row received from parsed file
 * @returns {Timeframe} Month-precision bounds; a bound is `null` when its column is empty or unreadable
 */

const getTimeframe = record => ({
  from: formatDate(record['Start date'] || null),
  from_precision: 'month',
  to: formatDate(record['End date'] || null),
  to_precision: 'month',
});

/**
 * Preprocess `title`.
 *
 * Input fields taken from the `record` are:
 *
 * - `Name of project`
 *
 * @memberof 2014uk16rfop001CsvTransform
 * @param {Object} record The row received from parsed file
 * @returns {String} The trimmed project name, or an empty string when the column is empty
 */

const getTitle = record => {
  const name = record['Name of project'];

  return name ? name.trim() : '';
};

/**
 * Preprocess `type`.
 *
 * Input fields taken from the `record` are:
 *
 * - `Type of fund`
 *
 * @memberof 2014uk16rfop001CsvTransform
 * @param {Object} record The row received from parsed file
 * @returns {Array} The trimmed fund type, or an empty array when the column is empty
 */

const getType = record => {
  const fund = record['Type of fund'];

  return fund ? [fund.trim()] : [];
};

/**
 * Map fields for 2014uk16rfop001 producer, CSV file types
 *
 * Example input data: {@link https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/csv/test/stubs/record.json|stub}
 *
 * Transform function: {@link https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/csv/src/lib/transform.js|implementation details}
 * @name 2014uk16rfop001CsvTransform
 * @param {Object} record Piece of data to transform before going to harmonized storage.
 * @returns {Project} JSON matching the type fields.
 */
+ */ + +export default (record: Object): Project | null => { + if (!record) return null; + + // Map the fields + return { + action: '', + budget: getBudget(record), + call_year: '', + description: getDescription(record), + ec_priorities: [], + media: [], + programme_name: '', + project_id: getProjectId(record), + project_locations: getLocations(record), + project_website: '', + complete: false, + related_links: [], + reporting_organisation: 'Member states', + results: { + available: '', + result: '', + }, + status: '', + sub_programme_name: '', + success_story: '', + themes: getThemes(record), + third_parties: getThirdParties(record), + timeframe: getTimeframe(record), + title: getTitle(record), + type: getType(record), + }; +}; diff --git a/services/ingestion/etl/2014uk16rfop001/csv/test/stubs/record.json b/services/ingestion/etl/2014uk16rfop001/csv/test/stubs/record.json new file mode 100644 index 000000000..78bf2e339 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/csv/test/stubs/record.json @@ -0,0 +1,15 @@ +{ + "Recipient of funds(ERDF/ESF beneficiary)": "NOMS CFO", + "Name of project": "CFO3", + "Type of fund": "ESF", + "Summary of project(max 100 words)": "NOMS CFO delivers services which prepare offenders to access mainstream employment, training and education. There is a focus on offenders with multiple barriers and those who fall into designated hard to reach categories. A strong focus on hard to reach groups remains at the centre of CFO delivery. Current mainstream provision is not accessible for many offenders with often limited benefits for those who have been excluded from such activity. NOMS CFO will give them the skills to engage with the mainstream. 
", + "Start date": "Apr-15", + "End date": "Dec-20", + "ERDF/ESF investment �m": "�131,000,000;�8,478,280;�18,358,652;�16,267,309;�11,225,168;�11,656,581;�13,444,587;�13,591,200;�8,535,577;�13,722,646", + "Total project costs �m (eligible project costs only)": "�247,000,000;;;;;;;;;", + "% of project funded by EU (Co-financing rate%)": "53%;;;;;;;;;", + "Location (postcode)": "National;South West;North West;South East;East;London;East Midlands;West Midlands;North East;Yorkshire", + "Local enterprise partnership area": "National", + "Country": "England", + "Type and focus support (category of intervention)": "Social Inclusion" +} diff --git a/services/ingestion/etl/2014uk16rfop001/csv/test/unit/lib/__snapshots__/transform.spec.js.snap b/services/ingestion/etl/2014uk16rfop001/csv/test/unit/lib/__snapshots__/transform.spec.js.snap new file mode 100644 index 000000000..08ceb9cbe --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/csv/test/unit/lib/__snapshots__/transform.spec.js.snap @@ -0,0 +1,183 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`2014uk16rfop001 CSV transformer Produces correct JSON output structure 1`] = ` +Object { + "action": "", + "budget": Object { + "eu_contrib": Object { + "currency": "GBP", + "raw": "53%;;;;;;;;;", + "value": 130910000, + }, + "funding_area": Array [], + "mmf_heading": "", + "other_contrib": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "private_fund": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "public_fund": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "total_cost": Object { + "currency": "GBP", + "raw": "�247,000,000;;;;;;;;;", + "value": 247000000, + }, + }, + "call_year": "", + "complete": false, + "description": "Summary of project(max 100 words): NOMS CFO delivers services which prepare offenders to access mainstream employment, training and education. 
There is a focus on offenders with multiple barriers and those who fall into designated hard to reach categories. A strong focus on hard to reach groups remains at the centre of CFO delivery. Current mainstream provision is not accessible for many offenders with often limited benefits for those who have been excluded from such activity. NOMS CFO will give them the skills to engage with the mainstream. +Local enterprise partnership area: National +", + "ec_priorities": Array [], + "media": Array [], + "programme_name": "", + "project_id": "67a9d35a1b9d8c961b81138799089b78", + "project_locations": Array [ + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "National", + "town": "", + }, + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "South West", + "town": "", + }, + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "North West", + "town": "", + }, + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "South East", + "town": "", + }, + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "East", + "town": "", + }, + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "London", + "town": "", + }, + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "East Midlands", + "town": "", + }, + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "West 
Midlands", + "town": "", + }, + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "North East", + "town": "", + }, + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "Yorkshire", + "town": "", + }, + ], + "project_website": "", + "related_links": Array [], + "reporting_organisation": "Member states", + "results": Object { + "available": "", + "result": "", + }, + "status": "", + "sub_programme_name": "", + "success_story": "", + "themes": Array [ + "Social Inclusion", + ], + "third_parties": Array [ + Object { + "address": "", + "country": "England", + "email": "", + "name": "NOMS CFO", + "phone": "", + "region": "", + "role": "Beneficiary", + "type": "", + "website": "", + }, + ], + "timeframe": Object { + "from": "2015-04-01T00:00:00.000Z", + "from_precision": "month", + "to": "2020-12-01T00:00:00.000Z", + "to_precision": "month", + }, + "title": "CFO3", + "type": Array [ + "ESF", + ], +} +`; diff --git a/services/ingestion/etl/2014uk16rfop001/csv/test/unit/lib/transform.spec.js b/services/ingestion/etl/2014uk16rfop001/csv/test/unit/lib/transform.spec.js new file mode 100644 index 000000000..9522b17f0 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/csv/test/unit/lib/transform.spec.js @@ -0,0 +1,22 @@ +/** + * @jest-environment node + */ + +import mapper from '../../../src/lib/transform'; +import testRecord from '../../stubs/record.json'; + +describe('2014uk16rfop001 CSV transformer', () => { + let result = {}; + + beforeAll(() => { + result = mapper(testRecord); + }); + + test('Returns null when record is not provided', () => { + expect(mapper()).toBe(null); + }); + + test('Produces correct JSON output structure', () => { + expect(result).toMatchSnapshot(); + }); +}); diff --git a/services/ingestion/etl/2014uk16rfop001/csv/webpack.config.js 
b/services/ingestion/etl/2014uk16rfop001/csv/webpack.config.js new file mode 100644 index 000000000..30fd8ced7 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/csv/webpack.config.js @@ -0,0 +1,32 @@ +const slsw = require('serverless-webpack'); +const path = require('path'); + +module.exports = { + entry: slsw.lib.entries, + target: 'node', + mode: slsw.lib.webpack.isLocal ? 'development' : 'production', + optimization: { + minimize: process.env.EUBFR_ENV && process.env.EUBFR_ENV === 'prod', + }, + devtool: 'nosources-source-map', + externals: [{ 'aws-sdk': true }], + module: { + rules: [ + { + test: /\.js$/, + use: [ + { + loader: 'babel-loader', + }, + ], + include: __dirname, + exclude: /node_modules/, + }, + ], + }, + output: { + libraryTarget: 'commonjs2', + path: path.join(__dirname, '.webpack'), + filename: '[name].js', + }, +}; diff --git a/services/ingestion/etl/2014uk16rfop001/ods/README.md b/services/ingestion/etl/2014uk16rfop001/ods/README.md new file mode 100644 index 000000000..7b3a3ad3f --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/README.md @@ -0,0 +1,17 @@ +# 2014uk16rfop001 ODS ETL mapping rules + +Model to compare with is available at: https://ec-europa.github.io/eubfr-data-lake/ + +| Field | Target | +| ------------------------------------------------ | ----------------------- | +| Name of beneficiary | third_parties | +| Beneficiary identifier (E-claims ref) | description | +| Type of enterprise (drop down) | | +| Location of Benficiary NUTS level II (drop down) | project_locations | +| Sector NACE group level | description | +| Aid element £ | budget.total_cost | +| Aid instrument (drop down) | | +| Date of granting | timeframe.from | +| Objective of the aid | description, project_id | +| Granting authority | | +| SANI reference of the aid measure | description | diff --git a/services/ingestion/etl/2014uk16rfop001/ods/babel.config.js b/services/ingestion/etl/2014uk16rfop001/ods/babel.config.js new file mode 100644 
index 000000000..0397ff2b1 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/babel.config.js @@ -0,0 +1,29 @@ +module.exports = { + presets: [ + '@babel/preset-flow', + [ + '@babel/preset-env', + { + targets: { + node: '8.10', + }, + modules: false, + loose: true, + }, + ], + ], + env: { + test: { + presets: [ + [ + '@babel/preset-env', + { + targets: { + node: '8.10', + }, + }, + ], + ], + }, + }, +}; diff --git a/services/ingestion/etl/2014uk16rfop001/ods/package.json b/services/ingestion/etl/2014uk16rfop001/ods/package.json new file mode 100644 index 000000000..745e79654 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/package.json @@ -0,0 +1,32 @@ +{ + "private": true, + "name": "@eubfr/ingestion-etl-2014uk16rfop001-ods", + "version": "0.6.0", + "scripts": { + "deploy": "sls deploy -v", + "test:unit": "jest --testPathPattern=unit" + }, + "dependencies": { + "@eubfr/lib": "^0.7.0", + "@eubfr/logger-messenger": "^0.7.0", + "xlsx": "0.14.2" + }, + "devDependencies": { + "@babel/core": "7.4.3", + "@babel/preset-env": "7.4.3", + "@babel/preset-flow": "7.0.0", + "@eubfr/types": "^0.7.0", + "aws-sdk": "2.434.0", + "babel-jest": "24.7.0", + "babel-loader": "8.0.5", + "jest": "24.7.0", + "serverless": "1.40.0", + "serverless-webpack": "5.2.0", + "webpack": "4.29.6" + }, + "jest": { + "transform": { + "^.+\\.js$": "babel-jest" + } + } +} diff --git a/services/ingestion/etl/2014uk16rfop001/ods/serverless.yml b/services/ingestion/etl/2014uk16rfop001/ods/serverless.yml new file mode 100644 index 000000000..c6ea9443f --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/serverless.yml @@ -0,0 +1,107 @@ +service: ingestion-etl-2014uk16rfop001-ods + +plugins: + - serverless-webpack + +custom: + webpack: + webpackConfig: ./webpack.config.js + includeModules: + forceExclude: + - aws-sdk + packager: yarn + eubfrEnvironment: ${opt:eubfr_env, file(../../../../../config.json):eubfr_env, env:EUBFR_ENV, 'dev'} + bucketName: 
${file(../../../../../resources/harmonized-storage/serverless.yml):custom.bucketName} + +package: + individually: true + +provider: + name: aws + runtime: nodejs8.10 + timeout: 180 + stage: ${opt:stage, file(../../../../../config.json):stage, env:EUBFR_STAGE, 'dev'} + region: ${opt:region, file(../../../../../config.json):region, env:EUBFR_AWS_REGION, 'eu-central-1'} + deploymentBucket: + name: eubfr-${self:custom.eubfrEnvironment}-deploy + stackTags: + ENV: ${self:custom.eubfrEnvironment} + iamRoleStatements: + - Effect: 'Allow' + Action: + - 's3:PutObject' + Resource: + Fn::Join: + - '' + - - 'arn:aws:s3:::' + - ${self:custom.bucketName} + - '/*' + # Allow queueing messages to the DLQ https://docs.aws.amazon.com/lambda/latest/dg/dlq.html + - Effect: 'Allow' + Action: + - sqs:SendMessage + Resource: '*' + +functions: + parseOds: + handler: src/events/onParseODS.handler + name: ${self:provider.stage}-${self:service}-parseOds + memorySize: 1024 + environment: + BUCKET: ${self:custom.bucketName} + REGION: ${self:provider.region} + STAGE: ${self:provider.stage} + events: + - sns: + arn: + Fn::Join: + - '' + - - 'arn:aws:sns:' + - Ref: 'AWS::Region' + - ':' + - Ref: 'AWS::AccountId' + - ':${self:provider.stage}-etl-2014uk16rfop001-ods' + topicName: ${self:provider.stage}-etl-2014uk16rfop001-ods + +resources: + Resources: + ParseOdsLambdaFunction: + Type: 'AWS::Lambda::Function' + Properties: + DeadLetterConfig: + TargetArn: + Fn::ImportValue: ${self:provider.stage}:ingestion-dead-letter-queue:LambdaFailureQueue + SNSTopic2014uk16rfop001ODS: + Type: AWS::SNS::Topic + Properties: + TopicName: ${self:provider.stage}-etl-2014uk16rfop001-ods + DisplayName: 2014uk16rfop001 ODS ETL + SNSTopic2014uk16rfop001ODSPolicy: + Type: AWS::SNS::TopicPolicy + Properties: + PolicyDocument: + Version: '2012-10-17' + Statement: + - Sid: Allow-IngestionManager-Publish + Action: + - sns:Publish + Effect: Allow + Resource: + Fn::Join: + - '' + - - 'arn:aws:sns:' + - Ref: 'AWS::Region' + - ':' 
+ - Ref: 'AWS::AccountId' + - ':${self:provider.stage}-etl-2014uk16rfop001-*' + Principal: + AWS: + Fn::Join: + - '' + - - 'arn:aws:sts::' + - Ref: 'AWS::AccountId' + - ':assumed-role/ingestion-manager-${self:provider.stage}-' + - Ref: 'AWS::Region' + - '-lambdaRole/${self:provider.stage}-ingestion-manager-onObjectCreated' + Topics: + - Ref: SNSTopic2014uk16rfop001ODS diff --git a/services/ingestion/etl/2014uk16rfop001/ods/src/events/onParseODS.js b/services/ingestion/etl/2014uk16rfop001/ods/src/events/onParseODS.js new file mode 100644 index 000000000..25bb2b929 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/src/events/onParseODS.js @@ -0,0 +1,126 @@ +import AWS from 'aws-sdk'; // eslint-disable-line import/no-extraneous-dependencies +import XLSX from 'xlsx'; + +// ETL utilities. +import ensureExtensions from '@eubfr/lib/etl/ensureExtensions'; +import extractMessage from '@eubfr/lib/etl/extractMessage'; +import handleError from '@eubfr/lib/etl/handleError'; + +import MessengerFactory from '@eubfr/logger-messenger/src/lib/MessengerFactory'; +import { STATUS } from '@eubfr/logger-messenger/src/lib/status'; + +import transformRecord from '../lib/transform'; + +export const handler = async (event, context) => { + const { BUCKET, REGION, STAGE } = process.env; + + if (!BUCKET || !REGION || !STAGE) { + throw new Error( + 'BUCKET, REGION and STAGE environment variables are required!' 
+ ); + } + + try { + const snsMessage = extractMessage(event); + const { key } = snsMessage.object; + + if (!ensureExtensions({ file: key, extensions: ['.ods'] })) { + throw new Error('ODS file expected for this ETL.'); + } + + const messenger = MessengerFactory.Create({ context }); + const s3 = new AWS.S3(); + + await messenger.send({ + message: { + computed_key: key, + status_message: 'Start parsing ODS...', + status_code: STATUS.PARSING, + }, + to: ['logs'], + }); + + // Get file + const readStream = s3 + .getObject({ Bucket: snsMessage.bucket.name, Key: key }) + .createReadStream(); + + return new Promise((resolve, reject) => { + // Put data in buffer + const buffers = []; + readStream.on('data', data => { + buffers.push(data); + }); + + readStream.on('error', async e => + handleError( + { messenger, key, statusCode: STATUS.ERROR }, + { error: e, callback: reject } + ) + ); + + // Manage data + readStream.on('end', async () => { + let dataString = ''; + + // Parse file + const buffer = Buffer.concat(buffers); + const workbook = XLSX.read(buffer, { + cellText: false, + cellDates: true, + }); + const sheetNameList = workbook.SheetNames; + const parsedRows = XLSX.utils.sheet_to_json( + workbook.Sheets[sheetNameList[0]] + ); + + parsedRows.shift(); + const columnsMap = parsedRows.shift(); + + const improvedData = parsedRows.map(row => { + const improvedRow = {}; + + Object.keys(row).forEach(columnKey => { + if (columnsMap[columnKey]) { + const columnName = columnsMap[columnKey].trim(); + improvedRow[columnName] = row[columnKey]; + } + }); + + return improvedRow; + }); + + improvedData.forEach(record => { + const data = transformRecord(record); + dataString += `${JSON.stringify(data)}\n`; + }); + + // Load data + const params = { + Bucket: BUCKET, + Key: `${key}.ndjson`, + Body: dataString, + ContentType: 'application/x-ndjson', + }; + + await s3.upload(params).promise(); + + await messenger.send({ + message: { + computed_key: key, + status_message: + 'ODS parsed 
successfully. Results will be uploaded to ElasticSearch soon...', + status_code: STATUS.PARSED, + }, + to: ['logs'], + }); + + return resolve('ODS parsed successfully'); + }); + }); + } catch (e) { + throw e; + } +}; + +export default handler; diff --git a/services/ingestion/etl/2014uk16rfop001/ods/src/lib/transform.js b/services/ingestion/etl/2014uk16rfop001/ods/src/lib/transform.js new file mode 100644 index 000000000..76c28d13e --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/src/lib/transform.js @@ -0,0 +1,209 @@ +// @flow + +import crypto from 'crypto'; +import type { Project } from '@eubfr/types'; +import sanitizeBudgetItem from '@eubfr/lib/budget/budgetFormatter'; + +/** + * Preprocess `budget`. + * + * Input fields taken from the `record` are: + * - `Aid element £` + * + * @memberof 2014uk16rfop001XlsTransform + * @param {Object} record The row received from parsed file + * @returns {Budget} + */ + +const getBudget = record => ({ + total_cost: sanitizeBudgetItem({ + value: record['Aid element £'], + currency: 'GBP', + raw: record['Aid element £'], + }), + eu_contrib: sanitizeBudgetItem(), + private_fund: sanitizeBudgetItem(), + public_fund: sanitizeBudgetItem(), + other_contrib: sanitizeBudgetItem(), + funding_area: [], + mmf_heading: '', +}); + +/** + * Preprocess `description`. 
/**
 * Preprocess `description`.
 *
 * Input fields taken from the `record` are:
 * - `Beneficiary identifier (E-claims ref)`
 * - `Sector NACE group level`
 * - `SANI reference of the aid measure`
 * - `Objective of the aid`
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {String} One `label: value` line per field that has a value
 */

const getDescription = record => {
  const fields = [
    'Beneficiary identifier (E-claims ref)',
    'Sector NACE group level',
    'SANI reference of the aid measure',
    'Objective of the aid',
  ];

  // Skip columns without a value so the text never contains "undefined".
  return fields
    .filter(field => record[field])
    .map(field => `${field}: ${record[field]} \n`)
    .join('');
};

/**
 * Preprocess `project_id`.
 *
 * Input fields taken from the `record` are:
 * - `Objective of the aid`
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {String} MD5 hex digest of the objective, or an empty string when the column is empty
 */

const getProjectId = record => {
  const objective = record['Objective of the aid'];

  if (!objective) return '';

  return crypto
    .createHash('md5')
    .update(objective)
    .digest('hex');
};

/**
 * Preprocess `project_locations`.
 *
 * Input fields taken from the `record` are:
 * - `Location of Benficiary NUTS level II (drop down)`
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {Array} Always one GB location; its `nuts` list is empty when the column is empty
 */

const getLocations = record => {
  // The column name is spelled exactly as in the source file ("Benficiary").
  const nutsRegion = record['Location of Benficiary NUTS level II (drop down)'];

  return [
    {
      address: '',
      centroid: null,
      country_code: 'GB',
      location: null,
      nuts: nutsRegion
        ? [
            {
              code: nutsRegion,
              name: '',
              level: 2,
              year: null,
            },
          ]
        : [],
      postal_code: '',
      region: '',
      town: '',
    },
  ];
};

/**
 * Preprocess `third_parties`.
 *
 * Input fields taken from the `record` are:
 * - `Name of beneficiary`
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {Array} A single beneficiary entry, or an empty array when the column is empty
 */

const getThirdParties = record => {
  const beneficiary = record['Name of beneficiary'];

  if (!beneficiary) return [];

  return [
    {
      address: '',
      country: 'GB',
      email: '',
      name: beneficiary.trim(),
      phone: '',
      region: '',
      role: 'Beneficiary',
      type: '',
      website: '',
    },
  ];
};

/**
 * Preprocess `timeframe`.
 *
 * Input fields taken from the `record` are:
 * - `Date of granting`
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {Timeframe} `from` is the granting date passed through unchanged (or `null`); `to` is always `null`
 */

const getTimeframe = record => ({
  from: record['Date of granting'] || null,
  from_precision: 'day',
  to: null,
  to_precision: 'day',
});

/**
 * Map fields for 2014uk16rfop001 producer, ODS file types
 *
 * Example input data: {@link https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/ods/test/stubs/record.json|stub}
 *
 * Transform function: {@link https://github.com/ec-europa/eubfr-data-lake/blob/master/services/ingestion/etl/2014uk16rfop001/ods/src/lib/transform.js|implementation details}
 *
 * @name 2014uk16rfop001XlsTransform
 * @param {Object} record Piece of data to transform before going to harmonized storage.
 * @returns {Project} JSON matching the type fields.
 */
+ */ +export default (record: Object): Project | null => { + if (!record) return null; + + // Map the fields + return { + action: '', + budget: getBudget(record), + call_year: '', + description: getDescription(record), + ec_priorities: [], + media: [], + programme_name: '', + project_id: getProjectId(record), + project_locations: getLocations(record), + project_website: '', + complete: false, + related_links: [], + reporting_organisation: 'Member states', + results: { + available: '', + result: '', + }, + status: '', + sub_programme_name: '', + success_story: '', + themes: [], + third_parties: getThirdParties(record), + timeframe: getTimeframe(record), + title: 'European Regional Development Fund State Aid', + type: [], + }; +}; diff --git a/services/ingestion/etl/2014uk16rfop001/ods/test/stubs/record.json b/services/ingestion/etl/2014uk16rfop001/ods/test/stubs/record.json new file mode 100644 index 000000000..2de0bcb00 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/test/stubs/record.json @@ -0,0 +1,13 @@ +{ + "Name of beneficiary": "CETO Wave Energy UK Ltd", + "Beneficiary identifier (E-claims ref)": "05R16P00351", + "Type of enterprise (drop down)": "SME", + "Location of Benficiary NUTS level II (drop down)": "Cornwall and Isles of Scilly", + "Sector NACE group level": "M71.20 Technical testing and analysis", + "Aid element £": 3636525, + "Aid instrument (drop down)": "Grant/Interest rate subsidy", + "Date of granting": "2016-11-21T22:00:00.000Z", + "Objective of the aid": "Fundamental research (Art. 
25(2)(a))", + "Granting authority": " DCLG", + "SANI reference of the aid measure": "SA 39161" +} diff --git a/services/ingestion/etl/2014uk16rfop001/ods/test/unit/events/onParseODS.spec.js b/services/ingestion/etl/2014uk16rfop001/ods/test/unit/events/onParseODS.spec.js new file mode 100644 index 000000000..81e55582c --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/test/unit/events/onParseODS.spec.js @@ -0,0 +1,20 @@ +/** + * @jest-environment node + */ + +import onParseODS from '../../../src/events/onParseODS'; + +describe(`Function onParseODS in "@eubfr/ingestion-etl-2014uk16rfop001-ods"`, () => { + test('The function requires BUCKET, REGION and STAGE environment variables', async () => { + const event = {}; + const context = {}; + + try { + await onParseODS(event, context); + } catch (error) { + expect(error.message).toEqual( + 'BUCKET, REGION and STAGE environment variables are required!' + ); + } + }); +}); diff --git a/services/ingestion/etl/2014uk16rfop001/ods/test/unit/lib/__snapshots__/transform.spec.js.snap b/services/ingestion/etl/2014uk16rfop001/ods/test/unit/lib/__snapshots__/transform.spec.js.snap new file mode 100644 index 000000000..49a2047b1 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/test/unit/lib/__snapshots__/transform.spec.js.snap @@ -0,0 +1,98 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`2014uk16rfop001 ODS transformer Produces correct JSON output structure 1`] = ` +Object { + "action": "", + "budget": Object { + "eu_contrib": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "funding_area": Array [], + "mmf_heading": "", + "other_contrib": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "private_fund": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "public_fund": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "total_cost": Object { + "currency": "GBP", + "raw": 3636525, + "value": 3636525, + }, + }, + "call_year": "", + "complete": false, + 
"description": "Beneficiary identifier (E-claims ref): 05R16P00351 +Sector NACE group level: M71.20 Technical testing and analysis +SANI reference of the aid measure: SA 39161 +Objective of the aid: Fundamental research (Art. 25(2)(a)) +", + "ec_priorities": Array [], + "media": Array [], + "programme_name": "", + "project_id": "3d3de125a4aa564b78a2979649ed587a", + "project_locations": Array [ + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [ + Object { + "code": "Cornwall and Isles of Scilly", + "level": 2, + "name": "", + "year": null, + }, + ], + "postal_code": "", + "region": "", + "town": "", + }, + ], + "project_website": "", + "related_links": Array [], + "reporting_organisation": "Member states", + "results": Object { + "available": "", + "result": "", + }, + "status": "", + "sub_programme_name": "", + "success_story": "", + "themes": Array [], + "third_parties": Array [ + Object { + "address": "", + "country": "GB", + "email": "", + "name": "CETO Wave Energy UK Ltd", + "phone": "", + "region": "", + "role": "Beneficiary", + "type": "", + "website": "", + }, + ], + "timeframe": Object { + "from": "2016-11-21T22:00:00.000Z", + "from_precision": "day", + "to": null, + "to_precision": "day", + }, + "title": "European Regional Development Fund State Aid", + "type": Array [], +} +`; diff --git a/services/ingestion/etl/2014uk16rfop001/ods/test/unit/lib/transform.spec.js b/services/ingestion/etl/2014uk16rfop001/ods/test/unit/lib/transform.spec.js new file mode 100644 index 000000000..fd159aeb0 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/test/unit/lib/transform.spec.js @@ -0,0 +1,22 @@ +/** + * @jest-environment node + */ + +import mapper from '../../../src/lib/transform'; +import testRecord from '../../stubs/record.json'; + +describe('2014uk16rfop001 ODS transformer', () => { + let result = {}; + + beforeAll(() => { + result = mapper(testRecord); + }); + + test('Returns null when record 
is not provided', () => { + expect(mapper()).toBe(null); + }); + + test('Produces correct JSON output structure', () => { + expect(result).toMatchSnapshot(); + }); +}); diff --git a/services/ingestion/etl/2014uk16rfop001/ods/webpack.config.js b/services/ingestion/etl/2014uk16rfop001/ods/webpack.config.js new file mode 100644 index 000000000..30fd8ced7 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/ods/webpack.config.js @@ -0,0 +1,32 @@ +const slsw = require('serverless-webpack'); +const path = require('path'); + +module.exports = { + entry: slsw.lib.entries, + target: 'node', + mode: slsw.lib.webpack.isLocal ? 'development' : 'production', + optimization: { + minimize: process.env.EUBFR_ENV && process.env.EUBFR_ENV === 'prod', + }, + devtool: 'nosources-source-map', + externals: [{ 'aws-sdk': true }], + module: { + rules: [ + { + test: /\.js$/, + use: [ + { + loader: 'babel-loader', + }, + ], + include: __dirname, + exclude: /node_modules/, + }, + ], + }, + output: { + libraryTarget: 'commonjs2', + path: path.join(__dirname, '.webpack'), + filename: '[name].js', + }, +}; diff --git a/services/ingestion/etl/2014uk16rfop001/xls/babel.config.js b/services/ingestion/etl/2014uk16rfop001/xls/babel.config.js new file mode 100644 index 000000000..0397ff2b1 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/xls/babel.config.js @@ -0,0 +1,29 @@ +module.exports = { + presets: [ + '@babel/preset-flow', + [ + '@babel/preset-env', + { + targets: { + node: '8.10', + }, + modules: false, + loose: true, + }, + ], + ], + env: { + test: { + presets: [ + [ + '@babel/preset-env', + { + targets: { + node: '8.10', + }, + }, + ], + ], + }, + }, +}; diff --git a/services/ingestion/etl/2014uk16rfop001/xls/package.json b/services/ingestion/etl/2014uk16rfop001/xls/package.json new file mode 100644 index 000000000..f89c97772 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/xls/package.json @@ -0,0 +1,33 @@ +{ + "private": true, + "name": 
"@eubfr/ingestion-etl-2014uk16rfop001-xls", + "version": "0.7.0", + "scripts": { + "deploy": "sls deploy -v", + "test:unit": "jest --testPathPattern=unit" + }, + "dependencies": { + "@eubfr/lib": "^0.7.0", + "@eubfr/logger-messenger": "^0.7.0", + "i18n-iso-countries": "3.7.8", + "xlsx": "0.14.2" + }, + "devDependencies": { + "@babel/core": "7.4.3", + "@babel/preset-env": "7.4.3", + "@babel/preset-flow": "7.0.0", + "@eubfr/types": "^0.7.0", + "aws-sdk": "2.434.0", + "babel-jest": "24.7.0", + "babel-loader": "8.0.5", + "jest": "24.7.0", + "serverless": "1.40.0", + "serverless-webpack": "5.2.0", + "webpack": "4.29.6" + }, + "jest": { + "transform": { + "^.+\\.js$": "babel-jest" + } + } +} diff --git a/services/ingestion/etl/2014uk16rfop001/xls/serverless.yml b/services/ingestion/etl/2014uk16rfop001/xls/serverless.yml new file mode 100644 index 000000000..cf8a981e1 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/xls/serverless.yml @@ -0,0 +1,123 @@ +service: ingestion-etl-2014uk16rfop001-xls + +plugins: + - serverless-webpack + +custom: + webpack: + webpackConfig: ./webpack.config.js + includeModules: + forceExclude: + - aws-sdk + packager: yarn + eubfrEnvironment: ${opt:eubfr_env, file(../../../../../config.json):eubfr_env, env:EUBFR_ENV, 'dev'} + bucketName: ${file(../../../../../resources/harmonized-storage/serverless.yml):custom.bucketName} + +package: + individually: true + +provider: + name: aws + runtime: nodejs8.10 + timeout: 60 + stage: ${opt:stage, file(../../../../../config.json):stage, env:EUBFR_STAGE, 'dev'} + region: ${opt:region, file(../../../../../config.json):region, env:EUBFR_AWS_REGION, 'eu-central-1'} + deploymentBucket: + name: eubfr-${self:custom.eubfrEnvironment}-deploy + stackTags: + ENV: ${self:custom.eubfrEnvironment} + iamRoleStatements: + - Effect: 'Allow' + Action: + - 's3:PutObject' + Resource: + Fn::Join: + - '' + - - 'arn:aws:s3:::' + - ${self:custom.bucketName} + - '/*' + # Allow queueing messages to the DLQ 
https://docs.aws.amazon.com/lambda/latest/dg/dlq.html + - Effect: 'Allow' + Action: + - sqs:SendMessage + Resource: '*' + +functions: + parseXls: + handler: src/events/onParseXLS.handler + name: ${self:provider.stage}-${self:service}-parseXls + memorySize: 1024 + environment: + BUCKET: ${self:custom.bucketName} + REGION: ${self:provider.region} + STAGE: ${self:provider.stage} + events: + - sns: + arn: + Fn::Join: + - '' + - - 'arn:aws:sns:' + - Ref: 'AWS::Region' + - ':' + - Ref: 'AWS::AccountId' + - ':${self:provider.stage}-etl-2014uk16rfop001-xls' + topicName: ${self:provider.stage}-etl-2014uk16rfop001-xls + - sns: + arn: + Fn::Join: + - '' + - - 'arn:aws:sns:' + - Ref: 'AWS::Region' + - ':' + - Ref: 'AWS::AccountId' + - ':${self:provider.stage}-etl-2014uk16rfop001-xlsx' + topicName: ${self:provider.stage}-etl-2014uk16rfop001-xlsx + +resources: + Resources: + ParseXlsLambdaFunction: + Type: 'AWS::Lambda::Function' + Properties: + DeadLetterConfig: + TargetArn: + Fn::ImportValue: ${self:provider.stage}:ingestion-dead-letter-queue:LambdaFailureQueue + SNSTopic2014uk16rfop001XLS: + Type: AWS::SNS::Topic + Properties: + TopicName: ${self:provider.stage}-etl-2014uk16rfop001-xls + DisplayName: 2014uk16rfop001 XLS ETL + SNSTopic2014uk16rfop001XLSX: + Type: AWS::SNS::Topic + Properties: + TopicName: ${self:provider.stage}-etl-2014uk16rfop001-xlsx + DisplayName: 2014uk16rfop001 XLSX ETL + SNSTopic2014uk16rfop001XLSPolicy: + Type: AWS::SNS::TopicPolicy + Properties: + PolicyDocument: + Version: '2012-10-17' + Statement: + - Sid: Allow-IngestionManager-Publish + Action: + - sns:Publish + Effect: Allow + Resource: + Fn::Join: + - '' + - - 'arn:aws:sns:' + - Ref: 'AWS::Region' + - ':' + - Ref: 'AWS::AccountId' + - ':${self:provider.stage}-etl-2014uk16rfop001-*' + Principal: + AWS: + Fn::Join: + - '' + - - 'arn:aws:sts::' + - Ref: 'AWS::AccountId' + - ':assumed-role/ingestion-manager-${self:provider.stage}-' + - Ref: 'AWS::Region' + - 
/**
 * Lambda entry point for XLS/XLSX files of the 2014uk16rfop001 producer.
 *
 * Reads the file referenced by the incoming SNS message from S3, detects the
 * funding type of the first sheet, transforms every record and uploads the
 * result as newline-delimited JSON to the `BUCKET` bucket.
 *
 * @param {Object} event SNS event, unwrapped with `extractMessage`
 * @param {Object} context Lambda context, handed to the messenger factory
 * @returns {Promise<String>} Resolves once the ndjson file has been uploaded
 * @throws {Error} When BUCKET, REGION or STAGE is missing, or when the file
 *   does not have an `.xls`/`.xlsx` extension
 */
export const handler = async (event, context) => {
  const { BUCKET, REGION, STAGE } = process.env;

  if (!BUCKET || !REGION || !STAGE) {
    throw new Error(
      'BUCKET, REGION and STAGE environment variables are required!'
    );
  }

  const snsMessage = extractMessage(event);
  const { key } = snsMessage.object;

  if (!ensureExtensions({ file: key, extensions: ['.xls', '.xlsx'] })) {
    throw new Error('XLS or XLSX file expected for this ETL.');
  }

  const messenger = MessengerFactory.Create({ context });
  const s3 = new AWS.S3();

  await messenger.send({
    message: {
      computed_key: key,
      status_message: 'Start parsing XLS...',
      status_code: STATUS.PARSING,
    },
    to: ['logs'],
  });

  // Get file
  const readStream = s3
    .getObject({ Bucket: snsMessage.bucket.name, Key: key })
    .createReadStream();

  return new Promise((resolve, reject) => {
    // Single failure path: report through handleError (which receives `reject`
    // as its callback) and make sure the promise settles even if the reporting
    // itself fails.
    const fail = async error => {
      try {
        await handleError(
          { messenger, key, statusCode: STATUS.ERROR },
          { error, callback: reject }
        );
      } catch (reportingError) {
        reject(error);
      }
    };

    // Put data in buffer
    const buffers = [];

    readStream.on('data', data => {
      buffers.push(data);
    });

    readStream.on('error', fail);

    // Manage data
    readStream.on('end', async () => {
      // Nothing awaits the promise returned by an async listener, so every
      // failure has to be caught here; otherwise it becomes an unhandled
      // rejection and the outer promise never settles.
      try {
        // Parse file
        const workbook = XLSX.read(Buffer.concat(buffers), {
          cellText: false,
          cellDates: true,
        });
        // Take into account only first sheet.
        const sheet = workbook.Sheets[workbook.SheetNames[0]];
        const rows = XLSX.utils.sheet_to_json(sheet);

        // The incoming XLS file could contain different types of information depending on funding type.
        const type = getFundingType(rows);

        if (!type) {
          throw new Error(
            'Provided file does not contain a valid structure for giving information about ESF or ESIF types of funding!'
          );
        }

        // Try to get the right transform corresponding function for this funding type.
        const transform = getTransform(type);

        if (!transform) {
          throw new Error(
            `Couldn't find a transform function corresponding to ${type}`
          );
        }

        // At this point, we have ensured that we can handle the incoming XLS file.
        // So it's worth preparing the data for the transform function.
        const records = getRecords({ rows, type });

        // One JSON document per line (ndjson).
        const dataString = records
          .map(record => `${JSON.stringify(transform(record))}\n`)
          .join('');

        // Load data
        await s3
          .upload({
            Bucket: BUCKET,
            Key: `${key}.ndjson`,
            Body: dataString,
            ContentType: 'application/x-ndjson',
          })
          .promise();

        await messenger.send({
          message: {
            computed_key: key,
            status_message:
              'XLS parsed successfully. Results will be uploaded to ElasticSearch soon...',
            status_code: STATUS.PARSED,
          },
          to: ['logs'],
        });

        resolve('XLS parsed successfully');
      } catch (error) {
        await fail(error);
      }
    });
  });
};

export default handler;
/**
 * Because the contents of the incoming XLS file can vary, this utility helps figuring out which transform function to use.
 * Only the first 2 items of the `sheet` array are inspected: they hold the background information or leading header row that identify the file.
 *
 * @param {Array} sheet Contains result of XLSX.utils.sheet_to_json(sheet)
 * @returns {String} The type of incoming XLS file: '' when unknown, otherwise 'ESF' or 'ESIF'
 */
const getFundingType = sheet => {
  // Both signatures need two rows. Without this guard a short or empty sheet
  // would hand `undefined` to improveObjectKeys and blow up there instead of
  // being reported as an unknown structure.
  if (!Array.isArray(sheet) || !sheet[0] || !sheet[1]) {
    return '';
  }

  const first = improveObjectKeys(sheet[0]);
  const second = improveObjectKeys(sheet[1]);
  const esfColumn = 'LIST OF OPERATIONSWYKAZ OPERACJI';

  let type = '';

  if (
    first[esfColumn] === 'Beneficiary Name' &&
    second[esfColumn] === 'Nazwa Odbiorcy'
  ) {
    type = 'ESF';
  }

  // NOTE(review): the 'Last updated January 2019' literal ties detection to
  // that edition of the file; confirm whether newer editions must match too.
  if (
    first.__EMPTY_1 ===
      'EUROPEAN STRUCTURAL AND INVESTMENT FUNDS LIST OF OPERATIONS 2014 TO 2020' &&
    second.__EMPTY === 'Last updated January 2019'
  ) {
    type = 'ESIF';
  }

  return type;
};

export default getFundingType;
/**
 * Returns a shallow copy of `o` with cleaned-up property names.
 *
 * Each key is trimmed, line breaks are removed (without inserting a space, so
 * a two-line header such as "LIST OF OPERATIONS\nWYKAZ OPERACJI" becomes
 * "LIST OF OPERATIONSWYKAZ OPERACJI") and runs of spaces are collapsed to one.
 * Values are copied untouched; when two keys normalise to the same name the
 * later one wins.
 *
 * @param {Object} o Row whose keys should be normalised; may be null/undefined
 * @returns {Object} New object, `{}` when `o` is null or undefined
 */
const improveObjectKeys = o => {
  const newObject = {};

  // Callers pass rows picked by index (e.g. sheet[1]), which are undefined
  // for short sheets; Object.keys(undefined) would throw.
  if (o === null || o === undefined) {
    return newObject;
  }

  Object.keys(o).forEach(key => {
    const newKey = key
      .trim()
      .replace(/[\r\n]/g, '')
      .replace(/ +/g, ' ');

    newObject[newKey] = o[key];
  });

  return newObject;
};

// ES default export, matching the sibling modules that import this helper.
export default improveObjectKeys;
/**
 * Build the `budget` section of a project.
 *
 * Fields read from the `record`:
 * - `Total Eligible Expenditure Allocated to the Operation;Current`
 * - `Union co‑financing rate, as per priority axis;`
 *
 * The EU contribution is the current total multiplied by the co-financing
 * rate; its `raw` property keeps the rate itself.
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {Budget}
 */

const getBudget = record => {
  const currency = 'GBP';
  const total =
    record['Total Eligible Expenditure Allocated to the Operation;Current'];
  const rate = record['Union co‑financing rate, as per priority axis;'];

  const totalCost = sanitizeBudgetItem({ value: total, currency, raw: total });
  const euContrib = sanitizeBudgetItem({
    value: total * rate,
    currency,
    raw: rate,
  });

  // No source columns for the remaining items: use the formatter's defaults.
  const privateFund = sanitizeBudgetItem();
  const publicFund = sanitizeBudgetItem();
  const otherContrib = sanitizeBudgetItem();

  return {
    total_cost: totalCost,
    eu_contrib: euContrib,
    private_fund: privateFund,
    public_fund: publicFund,
    other_contrib: otherContrib,
    funding_area: [],
    mmf_heading: '',
  };
};
/**
 * Derive `project_id` as an MD5 hex digest.
 *
 * Several rows share beneficiary and operation details, so the current total
 * expenditure is appended to the operation name before hashing in order to
 * keep those rows apart.
 *
 * Fields read from the `record`:
 * - `Operation Name`
 * - `Total Eligible Expenditure Allocated to the Operation;Current`
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {String}
 */

const getProjectId = record => {
  const name = record['Operation Name'];
  const total = String(
    record['Total Eligible Expenditure Allocated to the Operation;Current']
  );

  const hash = crypto.createHash('md5');
  hash.update(name + total);

  return hash.digest('hex');
};
/**
 * Build `third_parties` from the beneficiary column.
 *
 * Fields read from the `record`:
 * - `Beneficiary Name`
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {Array} A single beneficiary entry, or an empty list when the name is missing
 */

const getThirdParties = record => {
  const beneficiary = record['Beneficiary Name'];

  if (!beneficiary) {
    return [];
  }

  return [
    {
      address: '',
      country: 'GB',
      email: '',
      name: beneficiary.trim(),
      phone: '',
      region: '',
      role: 'Beneficiary',
      type: '',
      website: '',
    },
  ];
};
/**
 * Build `title` from the operation name, trimmed.
 *
 * Fields read from the `record`:
 * - `Operation Name`
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {String} Empty string when the operation name is missing
 */

const getTitle = record => {
  const name = record['Operation Name'];

  if (!name) {
    return '';
  }

  return name.trim();
};
2014uk16rfop001 XLS ETL mapping rules + +Funding type: ESIF (EUROPEAN STRUCTURAL AND INVESTMENT FUNDS) + +Model to compare with is available at: https://ec-europa.github.io/eubfr-data-lake/ + +| Field | Target | +| ------------------------------------------------------ | ----------------- | +| Recipient of funds | third_parties | +| Name of Project | title | +| Type of fund | description | +| Priority Axis | description | +| Summary of project (max 100 words) | description | +| Start date | timeframe.from | +| End date | timeframe.to | +| ERDF/ESF investment £m | budget.eu_contrib | +| Total project costs £m | budget.total_cost | +| % of project funded by EU | | +| Location (postcode) | project_locations | +| Local Enterprise Partnership area | project_locations | +| Country | project_locations | +| Type and focus of support (_Category of intervention)_ | themes | diff --git a/services/ingestion/etl/2014uk16rfop001/xls/src/lib/transform/ESIF/transform.js b/services/ingestion/etl/2014uk16rfop001/xls/src/lib/transform/ESIF/transform.js new file mode 100644 index 000000000..e94c6ea9d --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/xls/src/lib/transform/ESIF/transform.js @@ -0,0 +1,262 @@ +// @flow + +import crypto from 'crypto'; +import countries from 'i18n-iso-countries'; +import type { Project } from '@eubfr/types'; +import getCountryCode from '@eubfr/lib/location/getCountryCode'; +import sanitizeBudgetItem from '@eubfr/lib/budget/budgetFormatter'; + +/** + * Preprocess `budget`. 
/**
 * Preprocess `description`.
 *
 * Each available field contributes one `<field name>: <value> ` line. Fields
 * that are missing or empty in the record are left out, so the description
 * never contains the literal text "undefined".
 *
 * Input fields taken from the `record` are:
 * - `Type of fund`
 * - `Priority Axis`
 * - `Summary of project (max 100 words)`
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {String} Empty string when none of the fields is present
 */

const getDescription = record => {
  const fields = [
    'Type of fund',
    'Priority Axis',
    'Summary of project (max 100 words)',
  ];

  return (
    fields
      // Explicit checks rather than truthiness, so a numeric 0 is kept.
      .filter(
        field =>
          record[field] !== undefined &&
          record[field] !== null &&
          record[field] !== ''
      )
      .map(field => `${field}: ${record[field]} \n`)
      .join('')
  );
};
/**
 * Build `project_locations`: always exactly one location per record.
 *
 * Fields read from the `record`:
 * - `Location (postcode)`
 * - `Local Enterprise Partnership area`
 * - `Country`
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {Array}
 */

const getLocations = record => {
  // The country name is turned into a code through the lookup helpers; when
  // that yields nothing usable the location falls back to 'GB'.
  const resolvedCountry = getCountryCode(getCodeByCountry(record.Country));

  return [
    {
      address: '',
      centroid: null,
      country_code: resolvedCountry || 'GB',
      location: null,
      nuts: [],
      postal_code: record['Location (postcode)'] || '',
      region: record['Local Enterprise Partnership area'] || '',
      town: '',
    },
  ];
};
/**
 * Build `timeframe` from the start and end columns.
 *
 * Fields read from the `record`:
 * - `Start date`
 * - `End date`
 *
 * Values are passed through as received; a missing or empty value becomes
 * `null`. Precision is always reported as 'day'.
 *
 * @memberof 2014uk16rfop001XlsTransform
 * @param {Object} record The row received from parsed file
 * @returns {Timeframe}
 */

const getTimeframe = record => ({
  from: record['Start date'] || null,
  from_precision: 'day',
  to: record['End date'] || null,
  to_precision: 'day',
});
/**
 * Picks the transform function that matches a funding type.
 *
 * @param {String} type Funding type, 'ESF' or 'ESIF'
 * @returns {Function|null} The matching transform, or `null` for any other value
 */
const getTransform = type => {
  if (type === 'ESF') {
    return esf;
  }

  if (type === 'ESIF') {
    return esif;
  }

  return null;
};

export default getTransform;
"2016-06-30T22:00:00.000Z", + "Operation End Date": "2019-03-30T21:00:00.000Z", + "Total Eligible Expenditure Allocated to the Operation;Original": 666666, + "Total Eligible Expenditure Allocated to the Operation;Current": 866664, + "Union co‑financing rate, as per priority axis;": 0.5, + "Operation postcode; or other appropriate location indicator;": "LONDON LEP More Developed", + "Country": "UK", + "Category of Intervention": "02     Sustainable integration of young people not in employment, education or training in the labour market", + "Last updated": "2018-12-13T21:00:00.000Z" +} diff --git a/services/ingestion/etl/2014uk16rfop001/xls/test/stubs/ESIF/record.json b/services/ingestion/etl/2014uk16rfop001/xls/test/stubs/ESIF/record.json new file mode 100644 index 000000000..a813d23f3 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/xls/test/stubs/ESIF/record.json @@ -0,0 +1,16 @@ +{ + "Recipient of funds": "Hethel Innovation Ltd", + "Name of Project": "Breakthrough", + "Type of fund": "ERDF", + "Priority Axis": "1", + "Summary of project (max 100 words)": "BREAKTHROUGH creates unique environments that stimulate innovation, by leading diverse and cross-cutting teams on expeditions to new cross-cluster market opportunities. 
", + "Start date": "2018-12-31T21:00:00.000Z", + "End date": "2021-12-30T21:00:00.000Z", + "ERDF/ESF investment £m": 598176, + "Total project costs £m": 1196352, + "% of project funded by EU": 0.5, + "Location (postcode)": "NR14 8FB", + "Local Enterprise Partnership area": "New Anglia", + "Country": "England", + "Type and focus of support (*Category of intervention)*": "065 Research and Innovation processes, technology transfer and c…" +} diff --git a/services/ingestion/etl/2014uk16rfop001/xls/test/unit/events/onParseXLS.spec.js b/services/ingestion/etl/2014uk16rfop001/xls/test/unit/events/onParseXLS.spec.js new file mode 100644 index 000000000..2dad09746 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/xls/test/unit/events/onParseXLS.spec.js @@ -0,0 +1,20 @@ +/** + * @jest-environment node + */ + +import onParseXLS from '../../../src/events/onParseXLS'; + +describe(`Function onParseXLS in "@eubfr/ingestion-etl-2014uk16rfop001-xls"`, () => { + test('The function requires BUCKET, REGION and STAGE environment variables', async () => { + const event = {}; + const context = {}; + + try { + await onParseXLS(event, context); + } catch (error) { + expect(error.message).toEqual( + 'BUCKET, REGION and STAGE environment variables are required!' 
+ ); + } + }); +}); diff --git a/services/ingestion/etl/2014uk16rfop001/xls/test/unit/lib/__snapshots__/transform.spec.js.snap b/services/ingestion/etl/2014uk16rfop001/xls/test/unit/lib/__snapshots__/transform.spec.js.snap new file mode 100644 index 000000000..8c76a99a6 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/xls/test/unit/lib/__snapshots__/transform.spec.js.snap @@ -0,0 +1,180 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`XLS transformers for 2014uk16rfop001 Type ESF: produces correct JSON output structure 1`] = ` +Object { + "action": "", + "budget": Object { + "eu_contrib": Object { + "currency": "GBP", + "raw": 0.5, + "value": 433332, + }, + "funding_area": Array [], + "mmf_heading": "", + "other_contrib": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "private_fund": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "public_fund": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "total_cost": Object { + "currency": "GBP", + "raw": 866664, + "value": 866664, + }, + }, + "call_year": "", + "complete": false, + "description": "To establish and manage a geographic or employer sector based Career Cluster that aims to improve the labour market relevance of education", + "ec_priorities": Array [], + "media": Array [], + "programme_name": "", + "project_id": "c4e03cd67eedb94a8ecf3df27f16dfbf", + "project_locations": Array [ + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "", + "region": "LONDON", + "town": "", + }, + ], + "project_website": "", + "related_links": Array [], + "reporting_organisation": "Member states", + "results": Object { + "available": "", + "result": "", + }, + "status": "", + "sub_programme_name": "", + "success_story": "", + "themes": Array [ + "02     Sustainable integration of young people not in employment, education or training in the labour market", + ], + "third_parties": Array [ + Object { + 
"address": "", + "country": "GB", + "email": "", + "name": "15BILLION", + "phone": "", + "region": "", + "role": "Beneficiary", + "type": "", + "website": "", + }, + ], + "timeframe": Object { + "from": "2016-06-30T22:00:00.000Z", + "from_precision": "day", + "to": "2019-03-30T21:00:00.000Z", + "to_precision": "day", + }, + "title": "CAREERS CLUSTERS NORTH AND EAST LONDON", + "type": Array [], +} +`; + +exports[`XLS transformers for 2014uk16rfop001 Type ESIF: produces correct JSON output structure 1`] = ` +Object { + "action": "", + "budget": Object { + "eu_contrib": Object { + "currency": "GBP", + "raw": 598176, + "value": 598176, + }, + "funding_area": Array [], + "mmf_heading": "", + "other_contrib": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "private_fund": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "public_fund": Object { + "currency": "", + "raw": "", + "value": 0, + }, + "total_cost": Object { + "currency": "GBP", + "raw": 1196352, + "value": 1196352, + }, + }, + "call_year": "", + "complete": false, + "description": "Type of fund: ERDF +Priority Axis: 1 +Summary of project (max 100 words): BREAKTHROUGH creates unique environments that stimulate innovation, by leading diverse and cross-cutting teams on expeditions to new cross-cluster market opportunities.  
+", + "ec_priorities": Array [], + "media": Array [], + "programme_name": "", + "project_id": "176457b26915b02e6f2fc20b2b7af749", + "project_locations": Array [ + Object { + "address": "", + "centroid": null, + "country_code": "GB", + "location": null, + "nuts": Array [], + "postal_code": "NR14 8FB", + "region": "New Anglia", + "town": "", + }, + ], + "project_website": "", + "related_links": Array [], + "reporting_organisation": "Member states", + "results": Object { + "available": "", + "result": "", + }, + "status": "", + "sub_programme_name": "", + "success_story": "", + "themes": Array [ + "065 Research and Innovation processes, technology transfer and c…", + ], + "third_parties": Array [ + Object { + "address": "", + "country": "GB", + "email": "", + "name": "Hethel Innovation Ltd", + "phone": "", + "region": "", + "role": "Beneficiary", + "type": "", + "website": "", + }, + ], + "timeframe": Object { + "from": "2018-12-31T21:00:00.000Z", + "from_precision": "day", + "to": "2021-12-30T21:00:00.000Z", + "to_precision": "day", + }, + "title": "Breakthrough", + "type": Array [], +} +`; diff --git a/services/ingestion/etl/2014uk16rfop001/xls/test/unit/lib/transform.spec.js b/services/ingestion/etl/2014uk16rfop001/xls/test/unit/lib/transform.spec.js new file mode 100644 index 000000000..a8b16e66a --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/xls/test/unit/lib/transform.spec.js @@ -0,0 +1,32 @@ +/** + * @jest-environment node + */ + +import mapperESF from '../../../src/lib/transform/ESF/transform'; +import mapperESIF from '../../../src/lib/transform/ESIF/transform'; + +import testRecordESF from '../../stubs/ESF/record'; +import testRecordESIF from '../../stubs/ESIF/record'; + +describe('XLS transformers for 2014uk16rfop001', () => { + let esf = {}; + let esif = {}; + + beforeAll(() => { + esf = mapperESF(testRecordESF); + esif = mapperESIF(testRecordESIF); + }); + + test('Both types return null when record is not provided', () => { + 
expect(mapperESF()).toBe(null); + expect(mapperESIF()).toBe(null); + }); + + test('Type ESF: produces correct JSON output structure', () => { + expect(esf).toMatchSnapshot(); + }); + + test('Type ESIF: produces correct JSON output structure', () => { + expect(esif).toMatchSnapshot(); + }); +}); diff --git a/services/ingestion/etl/2014uk16rfop001/xls/webpack.config.js b/services/ingestion/etl/2014uk16rfop001/xls/webpack.config.js new file mode 100644 index 000000000..30fd8ced7 --- /dev/null +++ b/services/ingestion/etl/2014uk16rfop001/xls/webpack.config.js @@ -0,0 +1,32 @@ +const slsw = require('serverless-webpack'); +const path = require('path'); + +module.exports = { + entry: slsw.lib.entries, + target: 'node', + mode: slsw.lib.webpack.isLocal ? 'development' : 'production', + optimization: { + minimize: process.env.EUBFR_ENV && process.env.EUBFR_ENV === 'prod', + }, + devtool: 'nosources-source-map', + externals: [{ 'aws-sdk': true }], + module: { + rules: [ + { + test: /\.js$/, + use: [ + { + loader: 'babel-loader', + }, + ], + include: __dirname, + exclude: /node_modules/, + }, + ], + }, + output: { + libraryTarget: 'commonjs2', + path: path.join(__dirname, '.webpack'), + filename: '[name].js', + }, +}; diff --git a/tools/eubfr-cli/lib/getServices.js b/tools/eubfr-cli/lib/getServices.js index a49b7899f..01f00862c 100644 --- a/tools/eubfr-cli/lib/getServices.js +++ b/tools/eubfr-cli/lib/getServices.js @@ -18,6 +18,9 @@ const allServices = [ { service: 'ingestion-etl-2014tc16rfcb047-xls', exportEnv: false }, { service: 'ingestion-etl-2014tc16rfpc001-xls', exportEnv: false }, { service: 'ingestion-etl-2014tc16rftn002-xls', exportEnv: false }, + { service: 'ingestion-etl-2014uk16rfop001-xls', exportEnv: false }, + { service: 'ingestion-etl-2014uk16rfop001-csv', exportEnv: false }, + { service: 'ingestion-etl-2014uk16rfop001-ods', exportEnv: false }, { service: 'ingestion-etl-bulgaria-xls', exportEnv: false }, { service: 'ingestion-etl-cordis-csv', exportEnv: false 
}, { service: 'ingestion-etl-devco-xls', exportEnv: false },