Fix lib_test_utils, add OS environment variables for PySpark, and list the required libraries in the README

This commit is contained in:
Yûki VACHOT 2024-01-08 10:04:47 +01:00
parent 1f9509c574
commit b22ebc40fe
4 changed files with 19 additions and 8 deletions

View file

@@ -3,6 +3,8 @@
## Installation
- [Python 3.10](https://www.python.org/downloads/)
- pyspark==3.1.1
- findspark
- pyspark-test
- [Spark 3.1.1](https://spark.apache.org/downloads.html)
- [Hadoop 3.3.6](https://hadoop.apache.org/releases.html)
- [Java JDK 11](https://www.oracle.com/java/technologies/downloads/#java11)

View file

@@ -1,4 +1,5 @@
import os
import sys
import findspark
import logging
import pytest
@@ -7,9 +8,17 @@ from pyspark.sql import SparkSession
@pytest.fixture
def spark_session(request):
"""
Return a Spark Session
:param request:
:return: Spark session
"""
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11"
os.environ["SPARK_HOME"] = "C:\\SPARK\\spark-3.1.1-bin-hadoop3.2"
os.environ["HADOOP_HOME"] = "C:\\SPARK\\hadoop"
findspark.init()
spark = SparkSession.builder.master("local[*]").getOrCreate()

View file

@@ -1,15 +1,16 @@
from pyspark.sql import DataFrame
from pyspark_test import assert_pyspark_df_equal


def assert_df_equal(actual_df: DataFrame, expected_df: DataFrame) -> None:
    """Assert that two Spark DataFrames are equal.

    When the schemas differ, both schemas are printed first so the mismatch
    is visible in the test output; the actual equality check (schema and
    rows) is delegated to ``assert_pyspark_df_equal``, which raises
    ``AssertionError`` on any difference.

    :param actual_df: DataFrame produced by the code under test.
    :param expected_df: DataFrame holding the expected contents.
    :raises AssertionError: if the two DataFrames are not equal.
    """
    # Plain comparison instead of assert/except AssertionError: an `assert`
    # used only for control flow is silently stripped under `python -O`.
    if actual_df.schema != expected_df.schema:
        print('Error Schema')
        print('Actual :\n')
        actual_df.printSchema()
        print('Expected :\n')
        expected_df.printSchema()
    # Full comparison; this is what actually fails the test on mismatch.
    assert_pyspark_df_equal(actual_df, expected_df)

View file

@@ -19,7 +19,6 @@ def test_remove_extra_spaces(spark_session):
]
input_df = spark_session.createDataFrame(input_data, input_schema)
expected_schema = T.StructType(
[
T.StructField('name', T.StringType(), False),