Skip to content

Commit 9d24ea3

Browse files
committed
Initial commit
1 parent 441855e commit 9d24ea3

File tree

4 files changed

+256
-46
lines changed

4 files changed

+256
-46
lines changed

.github/workflows/docker-publish.yml

Lines changed: 14 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -1,63 +1,33 @@
1-
name: Docker
2-
3-
# This workflow uses actions that are not certified by GitHub.
4-
# They are provided by a third-party and are governed by
5-
# separate terms of service, privacy policy, and support
6-
# documentation.
1+
name: Publish Docker image
72

83
on:
9-
schedule:
10-
- cron: '31 5 * * *'
11-
push:
12-
branches: [ main ]
13-
# Publish semver tags as releases.
14-
tags: [ 'v*.*.*' ]
15-
pull_request:
16-
branches: [ main ]
17-
18-
env:
19-
# Use docker.io for Docker Hub if empty
20-
REGISTRY: ghcr.io
21-
# github.repository as <account>/<repo>
22-
IMAGE_NAME: ${{ github.repository }}
23-
4+
release:
5+
types: [published]
246

257
jobs:
26-
build:
27-
8+
push_to_registry:
9+
name: Push Docker image to Docker Hub
2810
runs-on: ubuntu-latest
29-
permissions:
30-
contents: read
31-
packages: write
32-
3311
steps:
34-
- name: Checkout repository
12+
- name: Check out the repo
3513
uses: actions/checkout@v2
3614

37-
# Login against a Docker registry except on PR
38-
# https://github.com/docker/login-action
39-
- name: Log into registry ${{ env.REGISTRY }}
40-
if: github.event_name != 'pull_request'
41-
uses: docker/login-action@28218f9b04b4f3f62068d7b6ce6ca5b26e35336c
15+
- name: Log in to Docker Hub
16+
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
4217
with:
43-
registry: ${{ env.REGISTRY }}
44-
username: ${{ github.actor }}
45-
password: ${{ secrets.GITHUB_TOKEN }}
18+
username: ${{ secrets.DOCKER_USERNAME }}
19+
password: ${{ secrets.DOCKER_PASSWORD }}
4620

47-
# Extract metadata (tags, labels) for Docker
48-
# https://github.com/docker/metadata-action
49-
- name: Extract Docker metadata
21+
- name: Extract metadata (tags, labels) for Docker
5022
id: meta
5123
uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
5224
with:
53-
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
25+
images: dirkscgm/pyspark3
5426

55-
# Build and push Docker image with Buildx (don't push on PR)
56-
# https://github.com/docker/build-push-action
5727
- name: Build and push Docker image
5828
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
5929
with:
6030
context: .
61-
push: ${{ github.event_name != 'pull_request' }}
31+
push: true
6232
tags: ${{ steps.meta.outputs.tags }}
63-
labels: ${{ steps.meta.outputs.labels }}
33+
labels: ${{ steps.meta.outputs.labels }}

.gitignore

Lines changed: 220 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,114 @@
1+
2+
# Created by https://www.toptal.com/developers/gitignore/api/python,intellij,spark
3+
# Edit at https://www.toptal.com/developers/gitignore?templates=python,intellij,spark
4+
5+
### Intellij ###
6+
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
7+
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
8+
9+
# User-specific stuff
10+
.idea/**/workspace.xml
11+
.idea/**/tasks.xml
12+
.idea/**/usage.statistics.xml
13+
.idea/**/dictionaries
14+
.idea/**/shelf
15+
16+
# AWS User-specific
17+
.idea/**/aws.xml
18+
19+
# Generated files
20+
.idea/**/contentModel.xml
21+
22+
# Sensitive or high-churn files
23+
.idea/**/dataSources/
24+
.idea/**/dataSources.ids
25+
.idea/**/dataSources.local.xml
26+
.idea/**/sqlDataSources.xml
27+
.idea/**/dynamic.xml
28+
.idea/**/uiDesigner.xml
29+
.idea/**/dbnavigator.xml
30+
31+
# Gradle
32+
.idea/**/gradle.xml
33+
.idea/**/libraries
34+
35+
# Gradle and Maven with auto-import
36+
# When using Gradle or Maven with auto-import, you should exclude module files,
37+
# since they will be recreated, and may cause churn. Uncomment if using
38+
# auto-import.
39+
# .idea/artifacts
40+
# .idea/compiler.xml
41+
# .idea/jarRepositories.xml
42+
# .idea/modules.xml
43+
# .idea/*.iml
44+
# .idea/modules
45+
# *.iml
46+
# *.ipr
47+
48+
# CMake
49+
cmake-build-*/
50+
51+
# Mongo Explorer plugin
52+
.idea/**/mongoSettings.xml
53+
54+
# File-based project format
55+
*.iws
56+
57+
# IntelliJ
58+
out/
59+
60+
# mpeltonen/sbt-idea plugin
61+
.idea_modules/
62+
63+
# JIRA plugin
64+
atlassian-ide-plugin.xml
65+
66+
# Cursive Clojure plugin
67+
.idea/replstate.xml
68+
69+
# Crashlytics plugin (for Android Studio and IntelliJ)
70+
com_crashlytics_export_strings.xml
71+
crashlytics.properties
72+
crashlytics-build.properties
73+
fabric.properties
74+
75+
# Editor-based Rest Client
76+
.idea/httpRequests
77+
78+
# Android studio 3.1+ serialized cache file
79+
.idea/caches/build_file_checksums.ser
80+
81+
### Intellij Patch ###
82+
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
83+
84+
# *.iml
85+
# modules.xml
86+
# .idea/misc.xml
87+
# *.ipr
88+
89+
# Sonarlint plugin
90+
# https://plugins.jetbrains.com/plugin/7973-sonarlint
91+
.idea/**/sonarlint/
92+
93+
# SonarQube Plugin
94+
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
95+
.idea/**/sonarIssues.xml
96+
97+
# Markdown Navigator plugin
98+
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
99+
.idea/**/markdown-navigator.xml
100+
.idea/**/markdown-navigator-enh.xml
101+
.idea/**/markdown-navigator/
102+
103+
# Cache file creation bug
104+
# See https://youtrack.jetbrains.com/issue/JBR-2257
105+
.idea/$CACHE_FILE$
106+
107+
# CodeStream plugin
108+
# https://plugins.jetbrains.com/plugin/12206-codestream
109+
.idea/codestream.xml
110+
111+
### Python ###
1112
# Byte-compiled / optimized / DLL files
2113
__pycache__/
3114
*.py[cod]
@@ -20,7 +131,6 @@ parts/
20131
sdist/
21132
var/
22133
wheels/
23-
pip-wheel-metadata/
24134
share/python-wheels/
25135
*.egg-info/
26136
.installed.cfg
@@ -50,6 +160,7 @@ coverage.xml
50160
*.py,cover
51161
.hypothesis/
52162
.pytest_cache/
163+
cover/
53164

54165
# Translations
55166
*.mo
@@ -72,6 +183,7 @@ instance/
72183
docs/_build/
73184

74185
# PyBuilder
186+
.pybuilder/
75187
target/
76188

77189
# Jupyter Notebook
@@ -82,7 +194,9 @@ profile_default/
82194
ipython_config.py
83195

84196
# pyenv
85-
.python-version
197+
# For a library or package, you might want to ignore these files since the code is
198+
# intended to run in multiple environments; otherwise, check them in:
199+
# .python-version
86200

87201
# pipenv
88202
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
@@ -127,3 +241,107 @@ dmypy.json
127241

128242
# Pyre type checker
129243
.pyre/
244+
245+
# pytype static type analyzer
246+
.pytype/
247+
248+
# Cython debug symbols
249+
cython_debug/
250+
251+
### Spark ###
252+
*#*#
253+
*.#*
254+
*.iml
255+
*.ipr
256+
*.pyc
257+
*.pyo
258+
*.swp
259+
*~
260+
.DS_Store
261+
.classpath
262+
.ensime
263+
.ensime_cache/
264+
.ensime_lucene
265+
.generated-mima*
266+
.idea/
267+
.project
268+
.pydevproject
269+
.scala_dependencies
270+
.settings
271+
/lib/
272+
R-unit-tests.log
273+
R/unit-tests.out
274+
R/cran-check.out
275+
R/pkg/vignettes/sparkr-vignettes.html
276+
R/pkg/tests/fulltests/Rplots.pdf
277+
build/*.jar
278+
build/apache-maven*
279+
build/scala*
280+
build/zinc*
281+
cache
282+
checkpoint
283+
conf/*.cmd
284+
conf/*.conf
285+
conf/*.properties
286+
conf/*.sh
287+
conf/*.xml
288+
conf/java-opts
289+
conf/slaves
290+
dependency-reduced-pom.xml
291+
derby.log
292+
dev/create-release/*final
293+
dev/create-release/*txt
294+
dev/pr-deps/
295+
docs/_site
296+
docs/api
297+
sql/docs
298+
sql/site
299+
lib_managed/
300+
lint-r-report.log
301+
log/
302+
logs/
303+
project/boot/
304+
project/build/target/
305+
project/plugins/lib_managed/
306+
project/plugins/project/build.properties
307+
project/plugins/src_managed/
308+
project/plugins/target/
309+
python/lib/pyspark.zip
310+
python/deps
311+
python/test_coverage/coverage_data
312+
python/test_coverage/htmlcov
313+
python/pyspark/python
314+
reports/
315+
scalastyle-on-compile.generated.xml
316+
scalastyle-output.xml
317+
scalastyle.txt
318+
spark-*-bin-*.tgz
319+
spark-tests.log
320+
src_managed/
321+
streaming-tests.log
322+
unit-tests.log
323+
work/
324+
docs/.jekyll-metadata
325+
326+
# For Hive
327+
TempStatsStore/
328+
metastore/
329+
metastore_db/
330+
sql/hive-thriftserver/test_warehouses
331+
warehouse/
332+
spark-warehouse/
333+
334+
# For R session data
335+
.RData
336+
.RHistory
337+
.Rhistory
338+
*.Rproj
339+
*.Rproj.*
340+
341+
.Rproj.user
342+
343+
# For SBT
344+
.jvmopts
345+
346+
347+
# End of https://www.toptal.com/developers/gitignore/api/python,intellij,spark

Dockerfile

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
FROM ubuntu:18.04
2+
# Spark/Hadoop build to download, and JAVA_HOME for the JDK 8 installed below (openjdk-8-jdk).
3+
ENV SPARK_VERSION=3.1.2
4+
ENV HADOOP_VERSION=2.7
5+
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
6+
# Single layer: OS packages, pinned Python deps, then Spark fetched over HTTPS from the permanent Apache archive (mirrors drop old releases).
7+
COPY requirements.txt .
8+
RUN apt-get update \
9+
&& apt-get install -y python3 python3-pip wget software-properties-common openjdk-8-jdk \
10+
&& rm -rf /var/lib/apt/lists/* \
11+
&& pip3 install --upgrade pip \
12+
&& pip3 install -r requirements.txt \
13+
&& wget --no-verbose https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
14+
&& tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
15+
&& mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
16+
&& rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz

requirements.txt

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,6 @@
1+
Cython==0.29.24
2+
pyspark==3.0.3
3+
pandas==1.1.5  # last pandas line supporting Python 3.6 (the python3 on ubuntu:18.04); 1.2+ needs >=3.7.1
4+
requests==2.26.0
5+
awscli==1.22.17
6+
boto3==1.20.17

0 commit comments

Comments
 (0)