Category descriptions (#89)
* generated descriptions

* added categories sync

* shorter descriptions

* avoid category name repetition

* sorted arrays

* lint

* proof-read
max-ostapenko authored Jan 15, 2025
1 parent 678cebb commit 070f0bb
Showing 3 changed files with 187 additions and 59 deletions.
package.json (2 changes: 1 addition & 1 deletion)
@@ -25,7 +25,7 @@
"lint:fix": "eslint --fix src/**/*.{js,json} tests/**/*.js scripts/**/*.js && jsonlint -isV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -is --trim-trailing-commas --enforce-double-quotes ./src/categories.json",
"validate": "yarn run lint && node ./scripts/validate.js",
"test": "jest",
"upload": "node ./scripts/upload_technology.js",
"upload": "node ./scripts/bigquery_upload.js",
"convert": "node --no-warnings ./scripts/convert.js",
"build": "yarn run validate && yarn run convert && node ./scripts/build.js"
},
scripts/upload_technology.js → scripts/bigquery_upload.js (134 changes: 77 additions & 57 deletions)
@@ -5,52 +5,12 @@ const fs = require('fs')
const path = require('path')
const { BigQuery } = require('@google-cloud/bigquery')

const readJsonFiles = (directory) => {
const files = fs.readdirSync(directory)
return files.reduce((mergedData, file) => {
const filePath = path.join(directory, file)
const data = fs.readFileSync(filePath, 'utf8')
return { ...mergedData, ...JSON.parse(data) }
}, {})
}

const getArray = (value) =>
typeof value === 'string' ? [value] : Array.isArray(value) ? value : []

const getRuleObject = (value) => {
if (typeof value === 'string') {
return [{ name: value, value: null }]
}
if (Array.isArray(value)) {
return value.map((key) => ({ name: key, value: null }))
}
if (typeof value === 'object') {
return Object.keys(value).map((key) => ({
name: key,
value:
typeof value[key] === 'object'
? JSON.stringify(value[key])
: value[key].toString(),
}))
}
return []
}

const loadToBigQuery = async (
data,
tableName = 'apps',
datasetName = 'wappalyzer',
writeDisposition = 'WRITE_TRUNCATE',
sourceFormat = 'NEWLINE_DELIMITED_JSON'
) => {
if (!data) {
throw new Error(`No data to load to \`${datasetName}.${tableName}\`.`)
}
const bigquery = new BigQuery({
keyFilename: '/tmp/gcp_key.json',
})

const bigquery = new BigQuery({
keyFilename: '/tmp/gcp_key.json',
})
const schema = {
const schemas = {
technologies: {
fields: [
{ name: 'name', type: 'STRING' },
{ name: 'categories', type: 'STRING', mode: 'REPEATED' },
@@ -137,8 +97,58 @@ const loadToBigQuery = async (
{ name: 'script', type: 'STRING', mode: 'REPEATED' },
{ name: 'html', type: 'STRING', mode: 'REPEATED' },
],
},
categories: {
fields: [
{ name: 'name', type: 'STRING' },
{ name: 'description', type: 'STRING' },
],
},
}

const readJsonFiles = (directory) => {
const files = fs.readdirSync(directory)
return files.reduce((mergedData, file) => {
const filePath = path.join(directory, file)
const data = fs.readFileSync(filePath, 'utf8')
return { ...mergedData, ...JSON.parse(data) }
}, {})
}

const getArray = (value) =>
typeof value === 'string' ? [value] : Array.isArray(value) ? value.sort() : []

const getRuleObject = (value) => {
if (typeof value === 'string') {
return [{ name: value, value: null }]
}
if (Array.isArray(value)) {
return value.map((key) => ({ name: key, value: null }))
}
if (typeof value === 'object') {
return Object.keys(value).map((key) => ({
name: key,
value:
typeof value[key] === 'object'
? JSON.stringify(value[key])
: value[key].toString(),
}))
}
return []
}

const loadToBigQuery = async (
data,
tableName = 'technologies',
datasetName = 'wappalyzer',
writeDisposition = 'WRITE_TRUNCATE',
sourceFormat = 'NEWLINE_DELIMITED_JSON'
) => {
if (!data) {
throw new Error(`No data to load to \`${datasetName}.${tableName}\`.`)
}

const schema = schemas[tableName]
const options = { schema, sourceFormat, writeDisposition }
const [job] = await bigquery
.dataset(datasetName)
@@ -147,11 +157,11 @@ const loadToBigQuery = async (

if (job.status.errors && job.status.errors.length > 0) {
console.error('Errors encountered:', job.status.errors)
throw new Error('Error loading data into BigQuery')
throw new Error(`Error loading data into ${datasetName}.${tableName}`)
}

console.log(
`Loaded ${job.numRowsLoaded} rows into ${datasetName}.${tableName}...`
`Loaded ${job.statistics.load.outputRows} rows into ${datasetName}.${tableName}`
)
}

@@ -164,9 +174,9 @@ const main = async () => {
const transformedTechnologies = Object.keys(technologies).map((key) => {
const app = {
name: key,
categories: technologies[key].cats.map(
(category) => categories[category].name
),
categories: technologies[key].cats
.map((category) => categories[category].name)
.sort(),
}

;[
@@ -208,13 +218,23 @@
const transformedTechnologiesJsonL = transformedTechnologies
.map((line) => JSON.stringify(line))
.join('\n')
const filePath = './transformedTechnologies.jsonl'
fs.writeFileSync(filePath, transformedTechnologiesJsonL)

await loadToBigQuery(filePath, 'apps')

// cleanup file
fs.unlinkSync(filePath)
const technologiesFilePath = './transformedTechnologies.jsonl'
fs.writeFileSync(technologiesFilePath, transformedTechnologiesJsonL)
await loadToBigQuery(technologiesFilePath, 'technologies')
fs.unlinkSync(technologiesFilePath)

const transformedCategoriesJsonL = Object.values(categories)
.map((value) =>
JSON.stringify({
name: value.name,
description: value.description,
})
)
.join('\n')
const categoriesFilePath = './transformedCategories.jsonl'
fs.writeFileSync(categoriesFilePath, transformedCategoriesJsonL)
await loadToBigQuery(categoriesFilePath, 'categories')
fs.unlinkSync(categoriesFilePath)
}

main().catch(console.error)
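
For readers skimming the diff, here is a minimal sketch of the row shapes the script now uploads, based on the schemas defined in the new file; the example values and variable names are invented, and the technologies schema contains more fields than the few visible in this view.

// Minimal illustrative sketch (not part of the commit): example JSONL rows
// matching the schemas in scripts/bigquery_upload.js. Values are invented;
// the technologies schema has more fields than the few shown in this diff.
const exampleTechnologyRow = {
  name: 'ExampleCMS', // STRING
  categories: ['CMS'], // STRING, REPEATED (sorted before upload)
  script: ['examplecms\\.js'], // STRING, REPEATED
  html: ['<meta name="generator" content="ExampleCMS'], // STRING, REPEATED
}

const exampleCategoryRow = {
  name: 'CMS', // STRING
  description: 'Software for creating and managing website content.', // STRING
}

// Each object is serialized with JSON.stringify() and written as one line of
// the .jsonl file that loadToBigQuery() loads into the `wappalyzer.<table>` table.
console.log(JSON.stringify(exampleTechnologyRow))
console.log(JSON.stringify(exampleCategoryRow))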