Add RAW data with new architecture
parent 87f4dfdb0b
commit 1e42b00ce7
21 changed files with 272 additions and 55 deletions
.idea/.gitignore | 8 (generated, vendored, new file)
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
.idea/.name | 1 (generated, new file)
@@ -0,0 +1 @@
PySpark Training Repository
.idea/PySpark Training Repository.iml | 10 (generated, new file)
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="jdk" jdkName="PySpark_3_10" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
.idea/inspectionProfiles/profiles_settings.xml | 6 (generated, new file)
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
.idea/misc.xml | 7 (generated, new file)
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="PySpark" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="PySpark_3_10" project-jdk-type="Python SDK" />
</project>
.idea/modules.xml | 8 (generated, new file)
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/pythonProject.iml" filepath="$PROJECT_DIR$/.idea/pythonProject.iml" />
    </modules>
  </component>
</project>
@@ -12,4 +12,9 @@

## Run Python Test
- path from src/test_pyspark_training
- `pytest -k test_`
- `pytest -k test_`

## Run pylint for code check


## Run Python doc with Sphinx
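Aside (not part of the diff): the `pytest -k test_` selection above can also be launched from Python. A minimal sketch, assuming pytest is installed in the project venv and the tests live under src/test_pyspark_training:

import pytest

# Equivalent of running `pytest -k test_` against the test package.
pytest.main(["-k", "test_", "src/test_pyspark_training"])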
@@ -0,0 +1,7 @@
name,age
Alice G.,13
John B.,20
Jack W.,19
Bob T.,35
John D.,9
Eve A.,12
init.py | 11
@@ -1,6 +1,7 @@
import os
import findspark
from pyspark.sql import SparkSession
from src.pyspark_training.output_dataset_1.compute_output_dataset_1 import compute_output_dataset_1


def init_env():
@@ -12,17 +13,15 @@ def init_env():


def init_spark():
    spark = SparkSession.builder.master("local[*]").getOrCreate()
    df = spark.createDataFrame([
        {'name': 'OUI OUI', 'age': 30},
    ])
    df.show()
    return SparkSession.builder.master("local[*]").getOrCreate()


def main():
    print("hey there")
    init_env()
    init_spark()
    spark_session = init_spark()

    compute_output_dataset_1(spark_session)


if __name__ == "__main__":
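Note (assumption, not from this commit): init.py imports findspark, but the body of init_env() sits outside the visible hunks. A minimal sketch of what such a bootstrap typically looks like, with the SPARK_HOME path being hypothetical:

import os
import findspark


def init_env():
    # Assumed: point findspark at a local Spark install before creating a session.
    os.environ.setdefault("SPARK_HOME", "/opt/spark")  # hypothetical path
    findspark.init()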
src/Makefile | 20 (new file)
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
src/conf.py | 56 (new file)
@@ -0,0 +1,56 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Project information -----------------------------------------------------

project = 'PySpark Training Repository'
copyright = '2024, Yûki VACHOT'
author = 'Yûki VACHOT'

# The full version, including alpha/beta/rc tags
release = '0.0.1'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
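Aside (assumption, not in the commit): for sphinx.ext.autodoc to import the modules under src/pyspark_training, the stock path-setup block that is left commented out above would typically be enabled, roughly like this, assuming conf.py stays in src/:

import os
import sys

# Make the repository root importable so autodoc can find src.pyspark_training.
sys.path.insert(0, os.path.abspath('..'))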
src/index.rst | 20 (new file)
@@ -0,0 +1,20 @@
.. PySpark Training Repository documentation master file, created by
   sphinx-quickstart on Tue Jan 9 09:55:22 2024.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to PySpark Training Repository's documentation!
=======================================================

.. toctree::
   :maxdepth: 2
   :caption: Contents:



Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
src/make.bat | 35 (new file)
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
@@ -1,18 +0,0 @@
import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def clean_output_dataset_1(df: DataFrame) -> DataFrame:
    """

    :param df:
    :return:
    """
    df = remove_extra_spaces(df, 'name')

    return df


def remove_extra_spaces(df, column_name):
    df_transformed = df.withColumn(column_name, F.regexp_replace(F.col(column_name), "\\s+", " "))
    return df_transformed
@@ -0,0 +1,17 @@
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import IntegerType


def remove_extra_spaces(df: DataFrame, column_name: str) -> DataFrame:
    """

    :param df:
    :param column_name:
    :return:
    """
    df_transformed = df.withColumn(
        column_name,
        F.regexp_replace(F.col(column_name), "\\s+", " ")
    )
    return df_transformed
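Usage sketch for the new remove_extra_spaces helper (illustration only, not part of the commit); it collapses runs of whitespace in the given column to single spaces:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("Alice    G.", 13)], ["name", "age"])
remove_extra_spaces(df, "name").show()  # name becomes "Alice G."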
@@ -1,31 +1,30 @@
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from src.pyspark_training.output_dataset_1.clean_output_dataset_1 import clean_output_dataset_1
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

import src.pyspark_training.output_dataset_1.cleaning_output_dataset_1 as C
import src.pyspark_training.output_dataset_1.processing_output_dataset_1 as P

INPUT_DATASET_1_PATH = './assets/output_dataset_1/raw/RAW_input_output_dataset_1.csv'
OUTPUT_DATASET_1_PATH = './assets/output_dataset_1/output/OUTPUT_output_dataset_1.csv'


def compute_output_dataset_1(df: DataFrame) -> DataFrame:

    df = clean_output_dataset_1(df)

    df = add_life_stage(df)

    return df


def add_life_stage(df: DataFrame) -> DataFrame:
def compute_output_dataset_1(spark_session: SparkSession):
    """
    Add life stage
    child if age < 13
    teenager if age >= 13 and <= 19
    adult for age>20
    :param df:
    Compute the output of output_dataset_1
    :param spark_session:
    :return:
    """
    df = df.withColumn(
        'life_stage',
        F.when(F.col('age') < 13, F.lit('child'))
        .when(F.col('age').between(13, 19), F.lit('teenager'))
        .otherwise(F.lit('adult'))
    )
    df_schema = StructType([
        StructField('name', StringType(), False),
        StructField('age', IntegerType(), False)
    ])
    df = spark_session.read.csv(INPUT_DATASET_1_PATH, header=True, schema=df_schema)

    return df
    # Cleaning
    cleaned_df = C.remove_extra_spaces(df, 'name')

    # Processing
    df = P.add_life_stage(cleaned_df)

    df.show()
    df.write.mode('overwrite').csv(OUTPUT_DATASET_1_PATH)
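Side note (sketch, not part of the commit): the new cleaning/processing split could equally be chained with DataFrame.transform, assuming raw_df is the DataFrame read from INPUT_DATASET_1_PATH above:

result = (
    raw_df
    .transform(lambda d: C.remove_extra_spaces(d, 'name'))  # cleaning step
    .transform(P.add_life_stage)                            # processing step
)
result.write.mode('overwrite').csv(OUTPUT_DATASET_1_PATH)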
@@ -0,0 +1,35 @@
import pyspark.sql.functions as F
from pyspark.sql import DataFrame


def add_life_stage(df: DataFrame) -> DataFrame:
    """
    Add life stage
    child if age < 13
    teenager if age >= 13 and <= 19
    adult for age>20
    :param df:
    :return:
    """
    df = df.withColumn(
        'life_stage',
        F.when(F.col('age') < 13, F.lit('child'))
        .when(F.col('age').between(13, 19), F.lit('teenager'))
        .when(F.col('age') >= 20, F.lit('adult'))
    )

    return df


def join_with_broadcast(big_df: DataFrame, smaller_df: DataFrame) -> DataFrame:
    """
    Join big dataset and smaller dataset with broadcast
    :param big_df:
    :param smaller_df:
    :return:
    """
    df = big_df.join(
        F.broadcast(smaller_df)
    )

    return df
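Behaviour sketch (not part of the commit): because the refactored add_life_stage has no .otherwise branch, a null age now yields a null life_stage, which is what the updated test below expects. Assuming an existing SparkSession named spark:

rows = [('Alice G.', 13), ('John D.', 9), ('Eve B.', None)]
df = spark.createDataFrame(rows, ['name', 'age'])
add_life_stage(df).show()
# Alice G. -> teenager, John D. -> child, Eve B. (age None) -> life_stage None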
@@ -1,6 +1,6 @@
from pyspark.sql import types as T
from src.test_pyspark_training.lib_test_utils import assert_df_equal
from src.pyspark_training.output_dataset_1.clean_output_dataset_1 import remove_extra_spaces
from src.pyspark_training.output_dataset_1.cleaning_output_dataset_1 import remove_extra_spaces


def test_remove_extra_spaces(spark_session):
@@ -1,6 +1,6 @@
from pyspark.sql import types as T
from src.test_pyspark_training.lib_test_utils import assert_df_equal
from src.pyspark_training.output_dataset_1.compute_output_dataset_1 import add_life_stage
from src.pyspark_training.output_dataset_1.processing_output_dataset_1 import add_life_stage


def test_add_life_stage(spark_session):
@@ -8,7 +8,7 @@ def test_add_life_stage(spark_session):
    input_schema = T.StructType(
        [
            T.StructField('name', T.StringType(), False),
            T.StructField('age', T.IntegerType(), False),
            T.StructField('age', T.IntegerType(), True),
        ]
    )
    input_data = [
@@ -18,14 +18,15 @@ def test_add_life_stage(spark_session):
        ('Bob T.', 35),
        ('John D.', 9),
        ('Eve A.', 12),
        ('Eve B.', None),
    ]
    input_df = spark_session.createDataFrame(input_data, input_schema)

    expected_schema = T.StructType(
        [
            T.StructField('name', T.StringType(), False),
            T.StructField('age', T.IntegerType(), False),
            T.StructField('life_stage', T.StringType(), False),
            T.StructField('age', T.IntegerType(), True),
            T.StructField('life_stage', T.StringType(), True),
        ]
    )
    expected_data = [
@@ -35,6 +36,7 @@ def test_add_life_stage(spark_session):
        ('Bob T.', 35, 'adult'),
        ('John D.', 9, 'child'),
        ('Eve A.', 12, 'child'),
        ('Eve B.', None, None),
    ]
    expected_df = spark_session.createDataFrame(expected_data, expected_schema)
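The tests above request a spark_session fixture that is not part of this diff; a hypothetical conftest.py sketch of such a fixture:

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    spark = SparkSession.builder.master("local[*]").appName("tests").getOrCreate()
    yield spark
    spark.stop()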