
Commit 749cb98

Merge pull request #15 from meeehow/docs
Update docs
2 parents: 7dfbce7 + aa3db08

File tree: 4 files changed, +38 −24 lines

- exporters/gcp/gcp.go
- exporters/postgres/postgres.go
- hashr.go
- readme.md

exporters/gcp/gcp.go (+1 −1)

@@ -20,7 +20,7 @@ import (

 const (
 	// Name contains name of the exporter.
-	Name = "gcp"
+	Name = "GCP"
 )

 // Exporter is an instance of GCP Exporter.

exporters/postgres/postgres.go (+1 −1)

@@ -34,7 +34,7 @@ import (

 const (
 	// Name contains name of the exporter.
-	Name = "postgre"
+	Name = "postgres"
 )

 // Exporter is an instance of Postgres Exporter.
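
These two renames are not cosmetic: the `-exporters` flag in hashr.go (next file in this diff) advertises exactly these `Name` constants as its accepted values, so the strings passed on the command line have to match them. A minimal sketch, assuming the binary is invoked as `./hashr` and that the `-exporters` value is matched against the exporter `Name` constants:

``` shell
# Hypothetical, partial invocation: enables both exporters by their Name
# constants ("GCP" from exporters/gcp/gcp.go, "postgres" from
# exporters/postgres/postgres.go). Other required flags are omitted here.
./hashr -exporters GCP,postgres
```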

hashr.go (+13 −13)

@@ -40,18 +40,18 @@ import (
 )

 var (
-	processingWorkerCount = flag.Int("processing_worker_count", 2, "Number of processing workers.")
-	importersToRun = flag.String("importers", strings.Join([]string{}, ","), fmt.Sprintf("Importers to be run: %s,%s,%s,%s", gcp.RepoName, targz.RepoName, windows.RepoName, wsus.RepoName))
-	exportersToRun = flag.String("exporters", strings.Join([]string{}, ","), fmt.Sprintf("Exporters to be run: %s,%s", gcpExporter.Name, postgresExporter.Name))
-	jobStorage = flag.String("storage", "", "Storage that should be used for storing data about processing jobs, can have one of the two values: postgres, cloudspanner")
-	cacheDir = flag.String("cache_dir", "/tmp/", "Path to cache dir used to store local cache.")
-	export = flag.Bool("export", true, "Whether to export samples, otherwise, they'll be saved to disk")
-	exportPath = flag.String("export_path", "/tmp/hashr-uploads", "If export is set to false, this is the folder where samples will be saved.")
-	reprocess = flag.String("reprocess", "", "Sha256 of sources that should be reprocessed")
-	spannerDBPath = flag.String("spanner_db_path", "", "Path to spanner DB.")
-	uploadPayloads = flag.Bool("upload_payloads", false, "If true the content of the files will be uploaded using defined exporters.")
-	cloudSpannerWorkerCount = flag.Int("cloudspanner_worker_count", 100, "Number of workers/goroutines that will be used to upload data to Cloud Spanner.")
-	gcpExporterGCSbucket = flag.String("gcp_exporter_gcs_bucket", "", "Name of the GCS bucket which will be used by GCP exporter to store exported samples.")
+	processingWorkerCount = flag.Int("processing_worker_count", 2, "Number of processing workers.")
+	importersToRun = flag.String("importers", strings.Join([]string{}, ","), fmt.Sprintf("Importers to be run: %s,%s,%s,%s", gcp.RepoName, targz.RepoName, windows.RepoName, wsus.RepoName))
+	exportersToRun = flag.String("exporters", strings.Join([]string{}, ","), fmt.Sprintf("Exporters to be run: %s,%s", gcpExporter.Name, postgresExporter.Name))
+	jobStorage = flag.String("storage", "", "Storage that should be used for storing data about processing jobs, can have one of the two values: postgres, cloudspanner")
+	cacheDir = flag.String("cache_dir", "/tmp/", "Path to cache dir used to store local cache.")
+	export = flag.Bool("export", true, "Whether to export samples, otherwise, they'll be saved to disk")
+	exportPath = flag.String("export_path", "/tmp/hashr-uploads", "If export is set to false, this is the folder where samples will be saved.")
+	reprocess = flag.String("reprocess", "", "Sha256 of sources that should be reprocessed")
+	spannerDBPath = flag.String("spanner_db_path", "", "Path to spanner DB.")
+	uploadPayloads = flag.Bool("upload_payloads", false, "If true the content of the files will be uploaded using defined exporters.")
+	gcpExporterWorkerCount = flag.Int("gcp_exporter_worker_count", 100, "Number of workers/goroutines that will be used to upload data to Cloud Spanner.")
+	gcpExporterGCSbucket = flag.String("gcp_exporter_gcs_bucket", "", "Name of the GCS bucket which will be used by GCP exporter to store exported samples.")

 	// Postgres DB flags
 	postgresHost = flag.String("postgres_host", "localhost", "PostgreSQL instance address.")

@@ -156,7 +156,7 @@ func main() {
 		glog.Exitf("Could not initialize GCP Storage client: %v", err)
 	}

-	gceExporter, err := gcpExporter.NewExporter(spannerClient, storageClient, *gcpExporterGCSbucket, *uploadPayloads, *cloudSpannerWorkerCount)
+	gceExporter, err := gcpExporter.NewExporter(spannerClient, storageClient, *gcpExporterGCSbucket, *uploadPayloads, *gcpExporterWorkerCount)
 	if err != nil {
 		glog.Exitf("Error initializing Postgres exporter: %v", err)
 	}
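
Because the flag is renamed from `-cloudspanner_worker_count` to `-gcp_exporter_worker_count` (and its value is now passed to the GCP exporter's `NewExporter` call), existing command lines need the new spelling. A hedged before/after sketch, assuming the binary is run as `./hashr` and that the `-exporters` value matches the exporter `Name` constant:

``` shell
# Before this commit (old flag name, old lowercase exporter name):
./hashr -exporters gcp -gcp_exporter_gcs_bucket <gcs_bucket_name> -upload_payloads -cloudspanner_worker_count 50

# After this commit (renamed flag; the default remains 100):
./hashr -exporters GCP -gcp_exporter_gcs_bucket <gcs_bucket_name> -upload_payloads -gcp_exporter_worker_count 50
```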

readme.md (+23 −9)

@@ -21,7 +21,7 @@
   - [WSUS](#wsus)
 - [Setting up exporters](#setting-up-exporters)
   - [Setting up Postgres exporter](#setting-up-postgres-exporter)
-  - [Setting up Cloud Spanner exporter](#setting-up-cloud-spanner-exporter)
+  - [Setting up GCP exporter](#setting-up-gcp-exporter)
 - [Additional flags](#additional-flags)

 ## About

@@ -369,27 +369,41 @@ If you didn't choose Postgres for processing job storage follow steps 1 & 2 from

 This is currently the default exporter; you don't need to explicitly enable it. By default the content of the actual files won't be uploaded to the PostgreSQL DB; if you wish to change that, use the `-upload_payloads true` flag.

-In order for the Postgres exporter to work you need to set the following flags: `-postgresHost <host> -postgresPort <port> -postgresUser <user> -postgresPassword <pass> -postgresDBName <db_name>`
+In order for the Postgres exporter to work you need to set the following flags: `-exporters postgres -postgresHost <host> -postgresPort <port> -postgresUser <user> -postgresPassword <pass> -postgresDBName <db_name>`

-#### Setting up Cloud Spanner exporter
+#### Setting up GCP exporter

-The Cloud Spanner exporter allows sending hashes, file metadata and the actual content of the files to a GCP Spanner instance. If you haven't set up Cloud Spanner for storing processing jobs, follow the steps in [Setting up Cloud Spanner](####setting-up-cloud-spanner) and instead of the last step run the following command to create the necessary tables:
+The GCP exporter allows sending hashes and file metadata to a GCP Spanner instance. Optionally, you can also upload the extracted files to a GCS bucket. If you haven't set up Cloud Spanner for storing processing jobs, follow the steps in [Setting up Cloud Spanner](####setting-up-cloud-spanner) and instead of the last step run the following command to create the necessary tables:

 ``` shell
 gcloud spanner databases ddl update hashr --instance=hashr --ddl-file=scripts/CreateCloudSpannerExporterTables.ddl
 ```

 If you have already set up Cloud Spanner for storing jobs data, you just need to run the command above and you're ready to go.

+If you'd like to upload the extracted files to GCS, you need to create a GCS bucket:
+
+Step 1: Create the bucket:
+``` shell
+gsutil mb -p <project_name> gs://<gcs_bucket_name>
+```
+
+Step 2: Make the service account admin of this bucket:
+``` shell
+gsutil iam ch serviceAccount:hashr@<project_name>.iam.gserviceaccount.com:objectAdmin gs://<gcs_bucket_name>
+```
+
+To use this exporter you need to provide the following flags: `-exporters GCP -gcp_exporter_gcs_bucket <gcs_bucket_name>`
+
 ### Additional flags

-1. `-processingWorkerCount`: This flag controls the number of parallel processing workers. Processing is CPU and I/O heavy; during my testing I found that 2 workers is the optimal setting.
-1. `-cacheDir`: Location of the local cache used for deduplication; it's advised to change it from `/tmp` to e.g. the home directory of the user that will be running hashr.
+1. `-processing_worker_count`: This flag controls the number of parallel processing workers. Processing is CPU and I/O heavy; during my testing I found that 2 workers is the optimal setting.
+1. `-cache_dir`: Location of the local cache used for deduplication; it's advised to change it from `/tmp` to e.g. the home directory of the user that will be running hashr.
 1. `-export`: When set to false, hashr will save the results to disk, bypassing the exporter.
-1. `-exportPath`: If export is set to false, this is the folder where samples will be saved.
+1. `-export_path`: If export is set to false, this is the folder where samples will be saved.
 1. `-reprocess`: Allows reprocessing of a given source (in case it e.g. errored out) based on the sha256 value stored in the jobs table.
-1. `-uploadPayloads`: Controls if the actual content of the files will be uploaded by the defined exporters.
-2. `-cloudSpannerWorkerCount`: Number of workers/goroutines that will be used to upload data to Cloud Spanner.
+1. `-upload_payloads`: Controls if the actual content of the files will be uploaded by the defined exporters.
+2. `-gcp_exporter_worker_count`: Number of workers/goroutines that the GCP exporter will use to upload the data.


 This is not an officially supported Google product.
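
Putting the updated readme together, here is a hedged end-to-end sketch of running hashr with the GCP exporter. The binary name, the Spanner database path format, and the placeholder values are assumptions, and the importer flags are omitted (see the importers section of the readme):

``` shell
# Bucket setup from the readme section above.
gsutil mb -p <project_name> gs://<gcs_bucket_name>
gsutil iam ch serviceAccount:hashr@<project_name>.iam.gserviceaccount.com:objectAdmin gs://<gcs_bucket_name>

# Hypothetical run: Cloud Spanner job storage plus the GCP exporter,
# using the snake_case flags defined in hashr.go.
./hashr \
  -storage cloudspanner \
  -spanner_db_path projects/<project_name>/instances/hashr/databases/hashr \
  -exporters GCP \
  -gcp_exporter_gcs_bucket <gcs_bucket_name> \
  -upload_payloads \
  -processing_worker_count 2 \
  -cache_dir "$HOME/.hashr-cache"
```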
