
Commit 749cb98

Merge pull request #15 from meeehow/docs
Update docs
2 parents: 7dfbce7 + aa3db08

File tree: 4 files changed, +38 −24 lines

- exporters/gcp/gcp.go
- exporters/postgres/postgres.go
- hashr.go
- readme.md

exporters/gcp/gcp.go (+1 −1)

@@ -20,7 +20,7 @@ import (

 const (
 	// Name contains name of the exporter.
-	Name = "gcp"
+	Name = "GCP"
 )

 // Exporter is an instance of GCP Exporter.

exporters/postgres/postgres.go (+1 −1)

@@ -34,7 +34,7 @@ import (

 const (
 	// Name contains name of the exporter.
-	Name = "postgre"
+	Name = "postgres"
 )

 // Exporter is an instance of Postgres Exporter.
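
These two renames are not cosmetic: the `-exporters` flag in hashr.go (next file in this diff) advertises exactly these `Name` constants as its accepted values, so the strings passed on the command line have to match them. A minimal sketch, assuming the binary is invoked as `./hashr` and that the `-exporters` value is matched against the exporter `Name` constants:

``` shell
# Hypothetical, partial invocation: enables both exporters by their Name
# constants ("GCP" from exporters/gcp/gcp.go, "postgres" from
# exporters/postgres/postgres.go). Other required flags are omitted here.
./hashr -exporters GCP,postgres
```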

hashr.go (+13 −13)

@@ -40,18 +40,18 @@ import (
 )

 var (
-	processingWorkerCount = flag.Int("processing_worker_count", 2, "Number of processing workers.")
-	importersToRun = flag.String("importers", strings.Join([]string{}, ","), fmt.Sprintf("Importers to be run: %s,%s,%s,%s", gcp.RepoName, targz.RepoName, windows.RepoName, wsus.RepoName))
-	exportersToRun = flag.String("exporters", strings.Join([]string{}, ","), fmt.Sprintf("Exporters to be run: %s,%s", gcpExporter.Name, postgresExporter.Name))
-	jobStorage = flag.String("storage", "", "Storage that should be used for storing data about processing jobs, can have one of the two values: postgres, cloudspanner")
-	cacheDir = flag.String("cache_dir", "/tmp/", "Path to cache dir used to store local cache.")
-	export = flag.Bool("export", true, "Whether to export samples, otherwise, they'll be saved to disk")
-	exportPath = flag.String("export_path", "/tmp/hashr-uploads", "If export is set to false, this is the folder where samples will be saved.")
-	reprocess = flag.String("reprocess", "", "Sha256 of sources that should be reprocessed")
-	spannerDBPath = flag.String("spanner_db_path", "", "Path to spanner DB.")
-	uploadPayloads = flag.Bool("upload_payloads", false, "If true the content of the files will be uploaded using defined exporters.")
-	cloudSpannerWorkerCount = flag.Int("cloudspanner_worker_count", 100, "Number of workers/goroutines that will be used to upload data to Cloud Spanner.")
-	gcpExporterGCSbucket = flag.String("gcp_exporter_gcs_bucket", "", "Name of the GCS bucket which will be used by GCP exporter to store exported samples.")
+	processingWorkerCount = flag.Int("processing_worker_count", 2, "Number of processing workers.")
+	importersToRun = flag.String("importers", strings.Join([]string{}, ","), fmt.Sprintf("Importers to be run: %s,%s,%s,%s", gcp.RepoName, targz.RepoName, windows.RepoName, wsus.RepoName))
+	exportersToRun = flag.String("exporters", strings.Join([]string{}, ","), fmt.Sprintf("Exporters to be run: %s,%s", gcpExporter.Name, postgresExporter.Name))
+	jobStorage = flag.String("storage", "", "Storage that should be used for storing data about processing jobs, can have one of the two values: postgres, cloudspanner")
+	cacheDir = flag.String("cache_dir", "/tmp/", "Path to cache dir used to store local cache.")
+	export = flag.Bool("export", true, "Whether to export samples, otherwise, they'll be saved to disk")
+	exportPath = flag.String("export_path", "/tmp/hashr-uploads", "If export is set to false, this is the folder where samples will be saved.")
+	reprocess = flag.String("reprocess", "", "Sha256 of sources that should be reprocessed")
+	spannerDBPath = flag.String("spanner_db_path", "", "Path to spanner DB.")
+	uploadPayloads = flag.Bool("upload_payloads", false, "If true the content of the files will be uploaded using defined exporters.")
+	gcpExporterWorkerCount = flag.Int("gcp_exporter_worker_count", 100, "Number of workers/goroutines that will be used to upload data to Cloud Spanner.")
+	gcpExporterGCSbucket = flag.String("gcp_exporter_gcs_bucket", "", "Name of the GCS bucket which will be used by GCP exporter to store exported samples.")

 	// Postgres DB flags
 	postgresHost = flag.String("postgres_host", "localhost", "PostgreSQL instance address.")

@@ -156,7 +156,7 @@ func main() {
 		glog.Exitf("Could not initialize GCP Storage client: %v", err)
 	}

-	gceExporter, err := gcpExporter.NewExporter(spannerClient, storageClient, *gcpExporterGCSbucket, *uploadPayloads, *cloudSpannerWorkerCount)
+	gceExporter, err := gcpExporter.NewExporter(spannerClient, storageClient, *gcpExporterGCSbucket, *uploadPayloads, *gcpExporterWorkerCount)
 	if err != nil {
 		glog.Exitf("Error initializing Postgres exporter: %v", err)
 	}
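
Because the flag is renamed from `-cloudspanner_worker_count` to `-gcp_exporter_worker_count` (and its value is now passed to the GCP exporter's `NewExporter` call), existing command lines need the new spelling. A hedged before/after sketch, assuming the binary is run as `./hashr` and that the `-exporters` value matches the exporter `Name` constant:

``` shell
# Before this commit (old flag name, old lowercase exporter name):
./hashr -exporters gcp -gcp_exporter_gcs_bucket <gcs_bucket_name> -upload_payloads -cloudspanner_worker_count 50

# After this commit (renamed flag; the default remains 100):
./hashr -exporters GCP -gcp_exporter_gcs_bucket <gcs_bucket_name> -upload_payloads -gcp_exporter_worker_count 50
```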

readme.md (+23 −9)

@@ -21,7 +21,7 @@
   - [WSUS](#wsus)
 - [Setting up exporters](#setting-up-exporters)
   - [Setting up Postgres exporter](#setting-up-postgres-exporter)
-  - [Setting up Cloud Spanner exporter](#setting-up-cloud-spanner-exporter)
+  - [Setting up GCP exporter](#setting-up-gcp-exporter)
 - [Additional flags](#additional-flags)

 ## About

@@ -369,27 +369,41 @@ If you didn't choose Postgres for processing job storage follow steps 1 & 2 from

 This is currently the default exporter; you don't need to explicitly enable it. By default the content of the actual files won't be uploaded to the PostgreSQL DB; if you wish to change that, use the `-upload_payloads true` flag.

-In order for the Postgres exporter to work you need to set the following flags: `-postgresHost <host> -postgresPort <port> -postgresUser <user> -postgresPassword <pass> -postgresDBName <db_name>`
+In order for the Postgres exporter to work you need to set the following flags: `-exporters postgres -postgresHost <host> -postgresPort <port> -postgresUser <user> -postgresPassword <pass> -postgresDBName <db_name>`

-#### Setting up Cloud Spanner exporter
+#### Setting up GCP exporter

-The Cloud Spanner exporter allows sending hashes, file metadata and the actual content of the files to a GCP Spanner instance. If you haven't set up Cloud Spanner for storing processing jobs, follow the steps in [Setting up Cloud Spanner](####setting-up-cloud-spanner) and instead of the last step run the following command to create the necessary tables:
+The GCP exporter allows sending hashes and file metadata to a GCP Spanner instance. Optionally, you can also upload the extracted files to a GCS bucket. If you haven't set up Cloud Spanner for storing processing jobs, follow the steps in [Setting up Cloud Spanner](####setting-up-cloud-spanner) and instead of the last step run the following command to create the necessary tables:

 ``` shell
 gcloud spanner databases ddl update hashr --instance=hashr --ddl-file=scripts/CreateCloudSpannerExporterTables.ddl
 ```

 If you have already set up Cloud Spanner for storing jobs data, you just need to run the command above and you're ready to go.

+If you'd like to upload the extracted files to GCS, you need to create a GCS bucket:
+
+Step 1: Create the bucket:
+``` shell
+gsutil mb -p <project_name> gs://<gcs_bucket_name>
+```
+
+Step 2: Make the service account admin of this bucket:
+``` shell
+gsutil iam ch serviceAccount:hashr@<project_name>.iam.gserviceaccount.com:objectAdmin gs://<gcs_bucket_name>
+```
+
+To use this exporter you need to provide the following flags: `-exporters GCP -gcp_exporter_gcs_bucket <gcs_bucket_name>`
+
 ### Additional flags

-1. `-processingWorkerCount`: This flag controls the number of parallel processing workers. Processing is CPU and I/O heavy; during my testing I found that 2 workers is the optimal setting.
-1. `-cacheDir`: Location of the local cache used for deduplication; it's advised to change it from `/tmp` to e.g. the home directory of the user that will be running hashr.
+1. `-processing_worker_count`: This flag controls the number of parallel processing workers. Processing is CPU and I/O heavy; during my testing I found that 2 workers is the optimal setting.
+1. `-cache_dir`: Location of the local cache used for deduplication; it's advised to change it from `/tmp` to e.g. the home directory of the user that will be running hashr.
 1. `-export`: When set to false, hashr will save the results to disk, bypassing the exporter.
-1. `-exportPath`: If export is set to false, this is the folder where samples will be saved.
+1. `-export_path`: If export is set to false, this is the folder where samples will be saved.
 1. `-reprocess`: Allows reprocessing of a given source (in case it e.g. errored out) based on the sha256 value stored in the jobs table.
-1. `-uploadPayloads`: Controls if the actual content of the files will be uploaded by the defined exporters.
-2. `-cloudSpannerWorkerCount`: Number of workers/goroutines that will be used to upload data to Cloud Spanner.
+1. `-upload_payloads`: Controls if the actual content of the files will be uploaded by the defined exporters.
+2. `-gcp_exporter_worker_count`: Number of workers/goroutines that the GCP exporter will use to upload the data.


 This is not an officially supported Google product.
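
Putting the updated readme together, here is a hedged end-to-end sketch of running hashr with the GCP exporter. The binary name, the Spanner database path format, and the placeholder values are assumptions, and the importer flags are omitted (see the importers section of the readme):

``` shell
# Bucket setup from the readme section above.
gsutil mb -p <project_name> gs://<gcs_bucket_name>
gsutil iam ch serviceAccount:hashr@<project_name>.iam.gserviceaccount.com:objectAdmin gs://<gcs_bucket_name>

# Hypothetical run: Cloud Spanner job storage plus the GCP exporter,
# using the snake_case flags defined in hashr.go.
./hashr \
  -storage cloudspanner \
  -spanner_db_path projects/<project_name>/instances/hashr/databases/hashr \
  -exporters GCP \
  -gcp_exporter_gcs_bucket <gcs_bucket_name> \
  -upload_payloads \
  -processing_worker_count 2 \
  -cache_dir "$HOME/.hashr-cache"
```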
