From c4fdb2860c74090ff86f4873833029263d9ec9ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Y=C3=BBki=20VACHOT?= Date: Fri, 5 Jan 2024 13:05:39 +0100 Subject: [PATCH] Init --- .gitignore | 1 + init.py | 39 +++++++++++++++++++ src/__init__.py | 0 src/pyspark_training/__init__.py | 0 .../output_dataset_1/__init__.py | 0 .../output_dataset_1/remove_extra_space.py | 3 ++ src/pyspark_training/utils.py | 0 src/test_pyspark_training/__init__.py | 0 src/test_pyspark_training/example_test.py | 3 ++ src/test_pyspark_training/lib_test_utils.py | 16 ++++++++ .../test_output_dataset_1/__init__.py | 0 .../test_remove_extra_space/__init__.py | 0 .../test_remove_extra_space.py | 0 src/test_pyspark_training/utils/__init__.py | 0 14 files changed, 62 insertions(+) create mode 100644 .gitignore create mode 100644 init.py create mode 100644 src/__init__.py create mode 100644 src/pyspark_training/__init__.py create mode 100644 src/pyspark_training/output_dataset_1/__init__.py create mode 100644 src/pyspark_training/output_dataset_1/remove_extra_space.py create mode 100644 src/pyspark_training/utils.py create mode 100644 src/test_pyspark_training/__init__.py create mode 100644 src/test_pyspark_training/example_test.py create mode 100644 src/test_pyspark_training/lib_test_utils.py create mode 100644 src/test_pyspark_training/test_output_dataset_1/__init__.py create mode 100644 src/test_pyspark_training/test_output_dataset_1/test_remove_extra_space/__init__.py create mode 100644 src/test_pyspark_training/test_output_dataset_1/test_remove_extra_space/test_remove_extra_space.py create mode 100644 src/test_pyspark_training/utils/__init__.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..757fee3 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/.idea \ No newline at end of file diff --git a/init.py b/init.py new file mode 100644 index 0000000..e03871a --- /dev/null +++ b/init.py @@ -0,0 +1,39 @@ +import os +import findspark +from pyspark.sql import SparkSession +import pyspark.sql.functions as F + +spark = SparkSession.builder.master("local[*]").getOrCreate() + + +sample_data = [ + {"name": "John D.", "age": 30}, + {"name": "Alice G.", "age": 25}, + {"name": "Bob T.", "age": 35}, + {"name": "Eve A.", "age": 28} +] +df = spark.createDataFrame(sample_data) + + + + + +transformed_df = remove_extra_spaces(df, "name") +transformed_df.show() + + +def main(): + init_env() + print("hey there") + + +if __name__ == "__main__": + main() + + +def init_env(): + os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11" + os.environ["SPARK_HOME"] = "C:\\SPARK\\spark-3.1.1-bin-hadoop3.2" + os.environ["HADOOP_HOME"] = "C:\\SPARK\\hadoop" + + findspark.init() \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pyspark_training/__init__.py b/src/pyspark_training/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pyspark_training/output_dataset_1/__init__.py b/src/pyspark_training/output_dataset_1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pyspark_training/output_dataset_1/remove_extra_space.py b/src/pyspark_training/output_dataset_1/remove_extra_space.py new file mode 100644 index 0000000..0952e86 --- /dev/null +++ b/src/pyspark_training/output_dataset_1/remove_extra_space.py @@ -0,0 +1,3 @@ +def remove_extra_spaces(df, column_name): + df_transformed = df.withColumn(column_name, F.regexp_replace(F.col(column_name), "\\s+", " ")) + return df_transformed \ No newline at end of file diff --git a/src/pyspark_training/utils.py b/src/pyspark_training/utils.py new file mode 100644 index 0000000..e69de29 diff --git a/src/test_pyspark_training/__init__.py b/src/test_pyspark_training/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/test_pyspark_training/example_test.py b/src/test_pyspark_training/example_test.py new file mode 100644 index 0000000..0255f62 --- /dev/null +++ b/src/test_pyspark_training/example_test.py @@ -0,0 +1,3 @@ + + +def test_example_test(): diff --git a/src/test_pyspark_training/lib_test_utils.py b/src/test_pyspark_training/lib_test_utils.py new file mode 100644 index 0000000..06a01c5 --- /dev/null +++ b/src/test_pyspark_training/lib_test_utils.py @@ -0,0 +1,16 @@ + +def assert_df_equal(df1, df2): + + try: + assert df1.schema() == df2.schema() + except AssertionError: + print('Error Schema') + print(df1.schema()) + print(df1.schema()) + + try: + assert df1.equals(df2) + except AssertionError: + print('Error Schema') + df1.show() + df2.show() diff --git a/src/test_pyspark_training/test_output_dataset_1/__init__.py b/src/test_pyspark_training/test_output_dataset_1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_space/__init__.py b/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_space/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_space/test_remove_extra_space.py b/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_space/test_remove_extra_space.py new file mode 100644 index 0000000..e69de29 diff --git a/src/test_pyspark_training/utils/__init__.py b/src/test_pyspark_training/utils/__init__.py new file mode 100644 index 0000000..e69de29