From b22ebc40fe18026fe3c84a692be54c033ab2515a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Y=C3=BBki=20VACHOT?=
Date: Mon, 8 Jan 2024 10:04:47 +0100
Subject: [PATCH] Fix lib_test_utils + Added OS Env for PYSPARK & added
 libraries in README

---
 README.md                                   |  2 ++
 src/test_pyspark_training/conftest.py       |  9 +++++++++
 src/test_pyspark_training/lib_test_utils.py | 15 ++++++++-------
 .../test_remove_extra_spaces.py             |  1 -
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 6c6efcc..217e34f 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,8 @@
 ## Installation
  - [Python 3.10](https://www.python.org/downloads/)
  - pyspark=3.1.1
+ - findspark
+ - pyspark-test
  - [Spark 3.1.1](https://spark.apache.org/downloads.html)
  - [Hadoop 3.3.6](https://hadoop.apache.org/releases.html)
  - [Java JDK 11](https://www.oracle.com/java/technologies/downloads/#java11)
diff --git a/src/test_pyspark_training/conftest.py b/src/test_pyspark_training/conftest.py
index a21fbac..e0a5155 100644
--- a/src/test_pyspark_training/conftest.py
+++ b/src/test_pyspark_training/conftest.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import findspark
 import logging
 import pytest
@@ -7,9 +8,17 @@ from pyspark.sql import SparkSession
 
 
 @pytest.fixture
 def spark_session(request):
+    """
+    Return a Spark Session
+    :param request:
+    :return: Spark session
+    """
+    os.environ['PYSPARK_PYTHON'] = sys.executable
+    os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
     os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11"
     os.environ["SPARK_HOME"] = "C:\\SPARK\\spark-3.1.1-bin-hadoop3.2"
     os.environ["HADOOP_HOME"] = "C:\\SPARK\\hadoop"
+
     findspark.init()
     spark = SparkSession.builder.master("local[*]").getOrCreate()
diff --git a/src/test_pyspark_training/lib_test_utils.py b/src/test_pyspark_training/lib_test_utils.py
index c22b46a..ed5af1f 100644
--- a/src/test_pyspark_training/lib_test_utils.py
+++ b/src/test_pyspark_training/lib_test_utils.py
@@ -1,15 +1,16 @@
+from pyspark.sql import DataFrame
 from pyspark_test import assert_pyspark_df_equal
 
 
-def assert_df_equal(df1, df2):
+def assert_df_equal(actual_df: DataFrame, expected_df: DataFrame) -> None:
     try:
-        assert df1.schema == df2.schema
+        assert actual_df.schema == expected_df.schema
     except AssertionError:
         print('Error Schema')
-        print('df1\n')
-        df1.printSchema()
-        print('df2\n')
-        df2.printSchema()
+        print('Actual :\n')
+        actual_df.printSchema()
+        print('Expected :\n')
+        expected_df.printSchema()
 
-    assert_pyspark_df_equal(df1, df2)
+    assert_pyspark_df_equal(actual_df, expected_df)
 
diff --git a/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_spaces/test_remove_extra_spaces.py b/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_spaces/test_remove_extra_spaces.py
index 883bbf3..717dd33 100644
--- a/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_spaces/test_remove_extra_spaces.py
+++ b/src/test_pyspark_training/test_output_dataset_1/test_remove_extra_spaces/test_remove_extra_spaces.py
@@ -19,7 +19,6 @@ def test_remove_extra_spaces(spark_session):
     ]
 
     input_df = spark_session.createDataFrame(input_data, input_schema)
-
     expected_schema = T.StructType(
         [
             T.StructField('name', T.StringType(), False),