This commit is contained in:
Yûki VACHOT 2024-01-05 13:05:39 +01:00
commit c4fdb2860c
14 changed files with 62 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@@ -0,0 +1 @@
/.idea

39
init.py Normal file
View file

@@ -0,0 +1,39 @@
import os
import findspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
def init_env():
    """Point the process at local Java/Spark/Hadoop installs and init findspark.

    Must run BEFORE any SparkSession is created: the JVM picks up
    JAVA_HOME/SPARK_HOME when it starts, so setting them afterwards
    (as the original import-time code did) has no effect.
    """
    os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11"
    os.environ["SPARK_HOME"] = "C:\\SPARK\\spark-3.1.1-bin-hadoop3.2"
    os.environ["HADOOP_HOME"] = "C:\\SPARK\\hadoop"
    findspark.init()


def main():
    """Entry point: configure the environment, build a sample DataFrame,
    clean its `name` column and display it."""
    init_env()
    print("hey there")

    # Session is created only after init_env() so the env vars above are seen.
    spark = SparkSession.builder.master("local[*]").getOrCreate()

    sample_data = [
        {"name": "John D.", "age": 30},
        {"name": "Alice G.", "age": 25},
        {"name": "Bob T.", "age": 35},
        {"name": "Eve A.", "age": 28},
    ]
    df = spark.createDataFrame(sample_data)

    # NOTE(review): remove_extra_spaces is defined in src/ of this commit but
    # was never imported here, so the original raised NameError at import time.
    # Add e.g. `from src.transformations import remove_extra_spaces` at the top
    # of the file (confirm the actual module path).
    transformed_df = remove_extra_spaces(df, "name")
    transformed_df.show()


if __name__ == "__main__":
    main()

0
src/__init__.py Normal file
View file

View file

View file

@@ -0,0 +1,3 @@
def remove_extra_spaces(df, column_name):
    """Collapse every run of whitespace in *column_name* to a single space.

    Returns a new DataFrame with the cleaned column; the input DataFrame
    is left untouched.
    """
    cleaned = F.regexp_replace(F.col(column_name), "\\s+", " ")
    return df.withColumn(column_name, cleaned)

View file

View file

View file

@@ -0,0 +1,3 @@
def test_example_test():

View file

@@ -0,0 +1,16 @@
def assert_df_equal(df1, df2):
    """Assert that two DataFrames have the same schema and the same data.

    On a mismatch the offending schemas / contents are printed for easier
    debugging and the AssertionError is re-raised so the calling test
    actually fails (the original swallowed the error, making the helper
    a no-op).

    NOTE(review): in PySpark, `DataFrame.schema` is a property, not a
    method, and plain PySpark DataFrames have no `equals` -- confirm the
    objects passed here really expose callable `schema()` / `equals()`.
    """
    try:
        assert df1.schema() == df2.schema()
    except AssertionError:
        print('Error Schema')
        print(df1.schema())
        print(df2.schema())  # bugfix: was df1 twice, second schema never shown
        raise
    try:
        assert df1.equals(df2)
    except AssertionError:
        print('Error Data')  # bugfix: was mislabeled 'Error Schema'
        df1.show()
        df2.show()
        raise