commit c4fdb2860c74090ff86f4873833029263d9ec9ca
Author: Yûki VACHOT
Date:   Fri Jan 5 13:05:39 2024 +0100

    Init

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..757fee3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/.idea
\ No newline at end of file
diff --git a/init.py b/init.py
new file mode 100644
index 0000000..e03871a
--- /dev/null
+++ b/init.py
@@ -0,0 +1,49 @@
+"""Demo script: configure a local Spark environment and run a sample transform."""
+
+import os
+
+import findspark
+
+
+def init_env():
+    """Set JAVA_HOME, SPARK_HOME and HADOOP_HOME, then call ``findspark.init()``.
+
+    The three paths are hard-coded Windows locations.
+    """
+    os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11"
+    os.environ["SPARK_HOME"] = "C:\\SPARK\\spark-3.1.1-bin-hadoop3.2"
+    os.environ["HADOOP_HOME"] = "C:\\SPARK\\hadoop"
+
+    findspark.init()
+
+
+def main():
+    """Build a small sample DataFrame, clean its ``name`` column and show it."""
+    # Configure the environment first: pyspark is imported and the session is
+    # created only after init_env() has run.
+    init_env()
+
+    from pyspark.sql import SparkSession
+
+    from src.pyspark_training.output_dataset_1.remove_extra_space import (
+        remove_extra_spaces,
+    )
+
+    spark = SparkSession.builder.master("local[*]").getOrCreate()
+
+    sample_data = [
+        {"name": "John D.", "age": 30},
+        {"name": "Alice G.", "age": 25},
+        {"name": "Bob T.", "age": 35},
+        {"name": "Eve A.", "age": 28},
+    ]
+    df = spark.createDataFrame(sample_data)
+
+    transformed_df = remove_extra_spaces(df, "name")
+    transformed_df.show()
+
+    print("hey there")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pyspark_training/__init__.py b/src/pyspark_training/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pyspark_training/output_dataset_1/__init__.py b/src/pyspark_training/output_dataset_1/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pyspark_training/output_dataset_1/remove_extra_space.py b/src/pyspark_training/output_dataset_1/remove_extra_space.py
new file mode 100644
index 0000000..0952e86
--- /dev/null
+++ b/src/pyspark_training/output_dataset_1/remove_extra_space.py
@@ -0,0 +1,9 @@
+import pyspark.sql.functions as F
+
+
+def remove_extra_spaces(df, column_name):
+    """Collapse every run of whitespace in ``column_name`` to a single space."""
+    return df.withColumn(
+        column_name,
+        F.regexp_replace(F.col(column_name), "\\s+", " "),
+    )
diff --git a/src/pyspark_training/utils.py b/src/pyspark_training/utils.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/test_pyspark_training/__init__.py b/src/test_pyspark_training/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/test_pyspark_training/example_test.py b/src/test_pyspark_training/example_test.py
new file mode 100644
index 0000000..0255f62
--- /dev/null
+++ b/src/test_pyspark_training/example_test.py
@@ -0,0 +1,5 @@
+
+
+def test_example_test():
+    """Placeholder so the module parses; replace with a real test case."""
+    assert True
diff --git a/src/test_pyspark_training/lib_test_utils.py b/src/test_pyspark_training/lib_test_utils.py
new file mode 100644
index 0000000..06a01c5
--- /dev/null
+++ b/src/test_pyspark_training/lib_test_utils.py
@@ -0,0 +1,20 @@
+
+def assert_df_equal(df1, df2):
+    """Assert that two Spark DataFrames have equal schemas and equal rows.
+
+    Rows are compared in collected order. On a mismatch the differing
+    schemas or contents are printed before ``AssertionError`` is raised.
+    """
+    # ``schema`` is a property on a Spark DataFrame, not a method.
+    if df1.schema != df2.schema:
+        print('Error Schema')
+        print(df1.schema)
+        print(df2.schema)
+        raise AssertionError('DataFrame schemas differ')
+
+    # Spark DataFrames have no ``equals``; compare the collected rows instead.
+    if df1.collect() != df2.collect():
+        print('Error Data')
+        df1.show()
+        df2.show()
+        raise AssertionError('DataFrame contents differ')
diff --git a/src/test_pyspark_training/test_output_dataset_1/__init__.py b/src/test_pyspark_training/test_output_dataset_1/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_space/__init__.py b/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_space/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_space/test_remove_extra_space.py b/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_space/test_remove_extra_space.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/test_pyspark_training/utils/__init__.py b/src/test_pyspark_training/utils/__init__.py
new file mode 100644
index 0000000..e69de29