Skip to content

Commit

Permalink
Add yaml pipeline examples
Browse files Browse the repository at this point in the history
  • Loading branch information
Amar3tto committed Mar 13, 2024
1 parent 483e3be commit 0a6bf97
Show file tree
Hide file tree
Showing 5 changed files with 177 additions and 0 deletions.
23 changes: 23 additions & 0 deletions Python/yaml/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
## Running pipelines

The Beam YAML parser is currently included as part of the Apache Beam Python SDK.
This can be installed (e.g. within a virtual environment) as

```
pip install apache_beam[yaml,gcp]
```

In addition, several of the provided transforms (such as SQL) are implemented
in Java and their expansion will require a working Java interpreter. (The
requisite artifacts will be automatically downloaded from the Apache Maven
repositories, so no further installs will be required.)
Docker is also currently required for local execution of these
cross-language-requiring transforms, but not for submission to a non-local
runner such as Flink or Dataflow.

Once the prerequisites are installed, you can execute a pipeline defined
in a YAML file as

```
python -m apache_beam.yaml.main --yaml_pipeline_file=/path/to/pipeline.yaml [other pipeline options such as the runner]
```
28 changes: 28 additions & 0 deletions Python/yaml/bigquery_to_kafka.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright 2024 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Beam YAML pipeline: read rows from a BigQuery table and publish them to a
# Kafka topic as JSON. `chain` means each transform feeds directly into the
# next (source -> sink), with no explicit `input:` wiring needed.
pipeline:
  type: chain

  source:
    type: ReadFromBigQuery
    config:
      # Fully-qualified table spec: <project>.<dataset>.<table>.
      table: project-id.dataset-id.table-id

  sink:
    type: WriteToKafka
    config:
      # Comma-separated list of Kafka broker host:port pairs.
      bootstrapServers: localhost:9092
      topic: topic-name
      # Serialize each row as JSON before publishing.
      format: JSON
48 changes: 48 additions & 0 deletions Python/yaml/kafka_to_bigquery.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2024 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Beam YAML pipeline: consume raw messages from a Kafka topic, decode the
# payload bytes to UTF-8 strings, and stream the results into a BigQuery
# table using fixed 60-second windows.
pipeline:
  type: chain

  source:
    type: ReadFromKafka
    config:
      # Comma-separated list of Kafka broker host:port pairs.
      bootstrapServers: localhost:9092
      topic: topic-name
      # RAW delivers the message payload as bytes (decoded below).
      format: RAW

  transforms:
    - type: MapToFields
      config:
        language: python
        fields:
          # Decode the raw Kafka payload bytes into a UTF-8 string field.
          message:
            callable: "lambda row: row.payload.decode('utf-8')"

  sink:
    type: WriteToBigQuery
    config:
      # Fully-qualified table spec: <project>.<dataset>.<table>.
      # (fixed typo: was "tabe-id")
      table: project-id.dataset-id.table-id
      create_disposition: CREATE_IF_NEEDED
      write_disposition: WRITE_APPEND
    # Batch the unbounded Kafka stream into fixed 60-second windows
    # before each BigQuery write.
    windowing:
      type: fixed
      size: 60s

# Pipeline options passed to the runner (here: streaming on Dataflow).
options:
  streaming: true
  runner: DataflowRunner
  project: project-id
  region: us-central1
  temp_location: gs://path/to/temp/folder
46 changes: 46 additions & 0 deletions Python/yaml/kafka_to_json.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Copyright 2024 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Beam YAML pipeline: consume raw messages from a Kafka topic, decode the
# payload bytes to UTF-8 strings, and write the results as JSON files to
# Cloud Storage using fixed 60-second windows.
pipeline:
  type: chain

  source:
    type: ReadFromKafka
    config:
      # Comma-separated list of Kafka broker host:port pairs.
      bootstrapServers: localhost:9092
      topic: topic-name
      # RAW delivers the message payload as bytes (decoded below).
      format: RAW

  transforms:
    - type: MapToFields
      config:
        language: python
        fields:
          # Decode the raw Kafka payload bytes into a UTF-8 string field.
          message:
            callable: "lambda row: row.payload.decode('utf-8')"

  sink:
    type: WriteToJson
    config:
      path: gs://path/to/output.json
    # Batch the unbounded Kafka stream into fixed 60-second windows so
    # each window produces its own output shard(s).
    windowing:
      type: fixed
      size: 60s

# Pipeline options passed to the runner (here: streaming on Dataflow).
options:
  streaming: true
  runner: DataflowRunner
  project: project-id
  region: us-central1
  temp_location: gs://path/to/temp/folder
32 changes: 32 additions & 0 deletions Python/yaml/test_csv_to_json.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright 2024 Google LLC

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# https://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Beam YAML pipeline: read CSV files, keep rows where col2 > 100, aggregate
# counts per col1 with SQL, and write the result as JSON. Transforms are
# wired explicitly via `input:` (no `chain` type here), each `input:`
# naming the upstream transform's type.
pipeline:
  transforms:
    - type: ReadFromCsv
      config:
        # Glob pattern matching one or more input CSV files.
        path: /path/to/input*.csv
    - type: Filter
      config:
        language: python
        # Keep only rows whose col2 value exceeds 100.
        keep: "col2 > 100"
      input: ReadFromCsv
    - type: Sql
      config:
        # PCOLLECTION refers to this transform's input collection.
        query: "select col1, count(*) as cnt from PCOLLECTION group by col1"
      input: Filter
    - type: WriteToJson
      config:
        path: /path/to/output.json
      input: Sql

0 comments on commit 0a6bf97

Please sign in to comment.