Add RAW data with new architecture
parent 87f4dfdb0b
commit 1e42b00ce7
21 changed files with 272 additions and 55 deletions
.idea/.gitignore | 8 (generated, vendored, new file)
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
.idea/.name | 1 (generated, new file)
@@ -0,0 +1 @@
PySpark Training Repository
.idea/PySpark Training Repository.iml | 10 (generated, new file)
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="jdk" jdkName="PySpark_3_10" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
.idea/inspectionProfiles/profiles_settings.xml | 6 (generated, new file)
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
.idea/misc.xml | 7 (generated, new file)
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="PySpark" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="PySpark_3_10" project-jdk-type="Python SDK" />
</project>
.idea/modules.xml | 8 (generated, new file)
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/pythonProject.iml" filepath="$PROJECT_DIR$/.idea/pythonProject.iml" />
    </modules>
  </component>
</project>
@@ -12,4 +12,9 @@

## Run Python Test
- path from src/test_pyspark_training
- `pytest -k test_`
- `pytest -k test_`

## Run pylint for code check


## Run Python doc with Sphinx
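Aside (not part of the diff): the `pytest -k test_` selection above can also be launched from Python. A minimal sketch, assuming pytest is installed in the project venv and the tests live under src/test_pyspark_training:

import pytest

# Equivalent of running `pytest -k test_` against the test package.
pytest.main(["-k", "test_", "src/test_pyspark_training"])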
@@ -0,0 +1,7 @@
name,age
Alice G.,13
John B.,20
Jack W.,19
Bob T.,35
John D.,9
Eve A.,12
init.py | 11
@@ -1,6 +1,7 @@
import os
import findspark
from pyspark.sql import SparkSession
from src.pyspark_training.output_dataset_1.compute_output_dataset_1 import compute_output_dataset_1


def init_env():
@@ -12,17 +13,15 @@ def init_env():


def init_spark():
    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.createDataFrame([
        {'name': 'OUI OUI', 'age': 30},
    ])
    df.show()
    return SparkSession.builder.master("local[*]").getOrCreate()


def main():
    print("hey there")
    init_env()
    init_spark()
    spark_session = init_spark()

    compute_output_dataset_1(spark_session)


if __name__ == "__main__":
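Note (assumption, not from this commit): init.py imports findspark, but the body of init_env() sits outside the visible hunks. A minimal sketch of what such a bootstrap typically looks like, with the SPARK_HOME path being hypothetical:

import os
import findspark


def init_env():
    # Assumed: point findspark at a local Spark install before creating a session.
    os.environ.setdefault("SPARK_HOME", "/opt/spark")  # hypothetical path
    findspark.init()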
src/Makefile | 20 (new file)
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
src/conf.py | 56 (new file)
@@ -0,0 +1,56 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Project information -----------------------------------------------------

project = 'PySpark Training Repository'
copyright = '2024, Yûki VACHOT'
author = 'Yûki VACHOT'

# The full version, including alpha/beta/rc tags
release = '0.0.1'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
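Aside (assumption, not in the commit): for sphinx.ext.autodoc to import the modules under src/pyspark_training, the stock path-setup block that is left commented out above would typically be enabled, roughly like this, assuming conf.py stays in src/:

import os
import sys

# Make the repository root importable so autodoc can find src.pyspark_training.
sys.path.insert(0, os.path.abspath('..'))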
src/index.rst | 20 (new file)
@@ -0,0 +1,20 @@
.. PySpark Training Repository documentation master file, created by
   sphinx-quickstart on Tue Jan 9 09:55:22 2024.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to PySpark Training Repository's documentation!
=======================================================

.. toctree::
   :maxdepth: 2
   :caption: Contents:



Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
src/make.bat | 35 (new file)
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
@@ -1,18 +0,0 @@
import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def clean_output_dataset_1(df: DataFrame) -> DataFrame:
    """

    :param df:
    :return:
    """
    df = remove_extra_spaces(df, 'name')

    return df


def remove_extra_spaces(df, column_name):
    df_transformed = df.withColumn(column_name, F.regexp_replace(F.col(column_name), "\\s+", " "))
    return df_transformed
@@ -0,0 +1,17 @@
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import IntegerType


def remove_extra_spaces(df: DataFrame, column_name: str) -> DataFrame:
    """

    :param df:
    :param column_name:
    :return:
    """
    df_transformed = df.withColumn(
        column_name,
        F.regexp_replace(F.col(column_name), "\\s+", " ")
    )
    return df_transformed
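Usage sketch for the new remove_extra_spaces helper (illustration only, not part of the commit); it collapses runs of whitespace in the given column to single spaces:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("Alice    G.", 13)], ["name", "age"])
remove_extra_spaces(df, "name").show()  # name becomes "Alice G."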
@@ -1,31 +1,30 @@
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from src.pyspark_training.output_dataset_1.clean_output_dataset_1 import clean_output_dataset_1
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

import src.pyspark_training.output_dataset_1.cleaning_output_dataset_1 as C
import src.pyspark_training.output_dataset_1.processing_output_dataset_1 as P

INPUT_DATASET_1_PATH = './assets/output_dataset_1/raw/RAW_input_output_dataset_1.csv'
OUTPUT_DATASET_1_PATH = './assets/output_dataset_1/output/OUTPUT_output_dataset_1.csv'


def compute_output_dataset_1(df: DataFrame) -> DataFrame:

    df = clean_output_dataset_1(df)

    df = add_life_stage(df)

    return df


def add_life_stage(df: DataFrame) -> DataFrame:
def compute_output_dataset_1(spark_session: SparkSession):
    """
    Add life stage
    child if age < 13
    teenager if age >= 13 and <= 19
    adult for age>20
    :param df:
    Compute the output of output_dataset_1
    :param spark_session:
    :return:
    """
    df = df.withColumn(
        'life_stage',
        F.when(F.col('age') < 13, F.lit('child'))
        .when(F.col('age').between(13, 19), F.lit('teenager'))
        .otherwise(F.lit('adult'))
    )
    df_schema = StructType([
        StructField('name', StringType(), False),
        StructField('age', IntegerType(), False)
    ])
    df = spark_session.read.csv(INPUT_DATASET_1_PATH, header=True, schema=df_schema)

    return df
    # Cleaning
    cleaned_df = C.remove_extra_spaces(df, 'name')

    # Processing
    df = P.add_life_stage(cleaned_df)

    df.show()
    df.write.mode('overwrite').csv(OUTPUT_DATASET_1_PATH)
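Side note (sketch, not part of the commit): the new cleaning/processing split could equally be chained with DataFrame.transform, assuming raw_df is the DataFrame read from INPUT_DATASET_1_PATH above:

result = (
    raw_df
    .transform(lambda d: C.remove_extra_spaces(d, 'name'))  # cleaning step
    .transform(P.add_life_stage)                            # processing step
)
result.write.mode('overwrite').csv(OUTPUT_DATASET_1_PATH)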
@@ -0,0 +1,35 @@
import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def add_life_stage(df: DataFrame) -> DataFrame:
    """
    Add life stage
    child if age < 13
    teenager if age >= 13 and <= 19
    adult for age>20
    :param df:
    :return:
    """
    df = df.withColumn(
        'life_stage',
        F.when(F.col('age') < 13, F.lit('child'))
        .when(F.col('age').between(13, 19), F.lit('teenager'))
        .when(F.col('age') >= 20, F.lit('adult'))
    )

    return df


def join_with_broadcast(big_df: DataFrame, smaller_df: DataFrame) -> DataFrame:
    """
    Join big dataset and smaller dataset with broadcast
    :param big_df:
    :param smaller_df:
    :return:
    """
    df = big_df.join(
        F.broadcast(smaller_df)
    )

    return df
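Behaviour sketch (not part of the commit): because the refactored add_life_stage has no .otherwise branch, a null age now yields a null life_stage, which is what the updated test below expects. Assuming an existing SparkSession named spark:

rows = [('Alice G.', 13), ('John D.', 9), ('Eve B.', None)]
df = spark.createDataFrame(rows, ['name', 'age'])
add_life_stage(df).show()
# Alice G. -> teenager, John D. -> child, Eve B. (age None) -> life_stage None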
@@ -1,6 +1,6 @@
from pyspark.sql import types as T
from src.test_pyspark_training.lib_test_utils import assert_df_equal
from src.pyspark_training.output_dataset_1.clean_output_dataset_1 import remove_extra_spaces
from src.pyspark_training.output_dataset_1.cleaning_output_dataset_1 import remove_extra_spaces


def test_remove_extra_spaces(spark_session):
@@ -1,6 +1,6 @@
from pyspark.sql import types as T
from src.test_pyspark_training.lib_test_utils import assert_df_equal
from src.pyspark_training.output_dataset_1.compute_output_dataset_1 import add_life_stage
from src.pyspark_training.output_dataset_1.processing_output_dataset_1 import add_life_stage


def test_add_life_stage(spark_session):
@@ -8,7 +8,7 @@ def test_add_life_stage(spark_session):
    input_schema = T.StructType(
        [
            T.StructField('name', T.StringType(), False),
            T.StructField('age', T.IntegerType(), False),
            T.StructField('age', T.IntegerType(), True),
        ]
    )
    input_data = [
@@ -18,14 +18,15 @@ def test_add_life_stage(spark_session):
        ('Bob T.', 35),
        ('John D.', 9),
        ('Eve A.', 12),
        ('Eve B.', None),
    ]
    input_df = spark_session.createDataFrame(input_data, input_schema)

    expected_schema = T.StructType(
        [
            T.StructField('name', T.StringType(), False),
            T.StructField('age', T.IntegerType(), False),
            T.StructField('life_stage', T.StringType(), False),
            T.StructField('age', T.IntegerType(), True),
            T.StructField('life_stage', T.StringType(), True),
        ]
    )
    expected_data = [
@@ -35,6 +36,7 @@ def test_add_life_stage(spark_session):
        ('Bob T.', 35, 'adult'),
        ('John D.', 9, 'child'),
        ('Eve A.', 12, 'child'),
        ('Eve B.', None, None),
    ]
    expected_df = spark_session.createDataFrame(expected_data, expected_schema)
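The tests above request a spark_session fixture that is not part of this diff; a hypothetical conftest.py sketch of such a fixture:

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    spark = SparkSession.builder.master("local[*]").appName("tests").getOrCreate()
    yield spark
    spark.stop()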