Fix lib_test_utils, add OS env vars for PySpark, and list the new libraries in the README

Yûki VACHOT 2024-01-08 10:04:47 +01:00
parent 1f9509c574
commit b22ebc40fe
4 changed files with 19 additions and 8 deletions

@@ -3,6 +3,8 @@
 ## Installation
 - [Python 3.10](https://www.python.org/downloads/)
 - pyspark=3.1.1
+- findspark
+- pyspark-test
 - [Spark 3.1.1](https://spark.apache.org/downloads.html)
 - [Hadoop 3.3.6](https://hadoop.apache.org/releases.html)
 - [Java JDK 11](https://www.oracle.com/java/technologies/downloads/#java11)
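
The README now pins Python 3.10, pyspark 3.1.1, findspark, and pyspark-test. A quick sanity check can catch environment mismatches before the suite runs; a minimal sketch, with the expected values taken straight from the list above:

```python
import sys

import findspark  # noqa: F401 -- imported only to confirm it is installed
import pyspark

# Pinned versions, mirroring the installation list above.
assert sys.version_info[:2] == (3, 10), sys.version
assert pyspark.__version__ == "3.1.1", pyspark.__version__
```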

@ -1,4 +1,5 @@
import os import os
import sys
import findspark import findspark
import logging import logging
import pytest import pytest
@ -7,9 +8,17 @@ from pyspark.sql import SparkSession
@pytest.fixture @pytest.fixture
def spark_session(request): def spark_session(request):
"""
Return a Spark Session
:param request:
:return: Spark session
"""
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11" os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11"
os.environ["SPARK_HOME"] = "C:\\SPARK\\spark-3.1.1-bin-hadoop3.2" os.environ["SPARK_HOME"] = "C:\\SPARK\\spark-3.1.1-bin-hadoop3.2"
os.environ["HADOOP_HOME"] = "C:\\SPARK\\hadoop" os.environ["HADOOP_HOME"] = "C:\\SPARK\\hadoop"
findspark.init() findspark.init()
spark = SparkSession.builder.master("local[*]").getOrCreate() spark = SparkSession.builder.master("local[*]").getOrCreate()
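
Pointing PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON at sys.executable makes the Spark workers and the driver run the same interpreter that launches pytest, which avoids the worker/driver Python-version-mismatch errors PySpark is prone to on Windows. The hunk ends right after the session is created; a minimal sketch of how the fixture's tail might look, assuming a request.addfinalizer teardown (the commit does not show this part):

```python
import pytest
from pyspark.sql import SparkSession


@pytest.fixture
def spark_session(request):
    """Return a local SparkSession, stopped automatically after each test."""
    # ... environment setup as in the diff above ...
    spark = SparkSession.builder.master("local[*]").getOrCreate()
    # Assumed teardown: stop the session once the requesting test finishes.
    request.addfinalizer(spark.stop)
    return spark
```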

@@ -1,15 +1,16 @@
+from pyspark.sql import DataFrame
 from pyspark_test import assert_pyspark_df_equal
-def assert_df_equal(df1, df2):
+def assert_df_equal(actual_df: DataFrame, expected_df: DataFrame) -> None:
     try:
-        assert df1.schema == df2.schema
+        assert actual_df.schema == expected_df.schema
     except AssertionError:
         print('Error Schema')
-        print('df1\n')
-        df1.printSchema()
-        print('df2\n')
-        df2.printSchema()
+        print('Actual :\n')
+        actual_df.printSchema()
+        print('Expected :\n')
+        expected_df.printSchema()
-    assert_pyspark_df_equal(df1, df2)
+    assert_pyspark_df_equal(actual_df, expected_df)
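
For reference, a usage sketch of the renamed helper together with the spark_session fixture above; the helper's module path is not shown in the diff, so the lib_test_utils import below is an assumption:

```python
import pyspark.sql.types as T

from lib_test_utils import assert_df_equal  # assumed module path


def test_assert_df_equal_accepts_identical_frames(spark_session):
    schema = T.StructType([T.StructField('name', T.StringType(), False)])
    df = spark_session.createDataFrame([('Alice',)], schema)
    # Same schema and rows on both sides, so no schema dump is printed
    # and assert_pyspark_df_equal raises nothing.
    assert_df_equal(df, df)
```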

@@ -19,7 +19,6 @@ def test_remove_extra_spaces(spark_session):
     ]
     input_df = spark_session.createDataFrame(input_data, input_schema)
-
     expected_schema = T.StructType(
         [
             T.StructField('name', T.StringType(), False),