Add env python for Spark
This commit is contained in:
parent
1e42b00ce7
commit
e28c446569
8 changed files with 79 additions and 32 deletions
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -3,5 +3,5 @@
|
||||||
<component name="Black">
|
<component name="Black">
|
||||||
<option name="sdkName" value="PySpark" />
|
<option name="sdkName" value="PySpark" />
|
||||||
</component>
|
</component>
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="PySpark_3_10" project-jdk-type="Python SDK" />
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (PySpark Training Repository)" project-jdk-type="Python SDK" />
|
||||||
</project>
|
</project>
|
||||||
3
.pylintrc
Normal file
3
.pylintrc
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
[BASIC]
|
||||||
|
# Good variable names which should always be accepted, separated by a comma.
|
||||||
|
good-names=i, j, k, df
|
||||||
43
README.md
43
README.md
|
|
@ -1,20 +1,31 @@
|
||||||
# Python PySpark Training Repository
|
Python PySpark Training Repository
|
||||||
|
==============
|
||||||
|
**Author:** *Yûki VACHOT*
|
||||||
|
|
||||||
## Installation
|
**Updated:** **10/01/24**
|
||||||
- [Python 3.10](https://www.python.org/downloads/)
|
# CONTENT TABLE
|
||||||
- pyspark=3.1.1
|
|
||||||
- findspark
|
|
||||||
- pyspark-test
|
|
||||||
- [Spark 3.1.1](https://spark.apache.org/downloads.html)
|
|
||||||
- [Hadoop 3.3.6](https://hadoop.apache.org/releases.html)
|
|
||||||
- [Java JDK 11](https://www.oracle.com/java/technologies/downloads/#java11)
|
|
||||||
- (not mandatory) [Anaconda for conda](https://www.anaconda.com/download/)
|
|
||||||
|
|
||||||
## Run Python Test
|
|
||||||
- path from src/test_pyspark_training
|
|
||||||
- `pytest -k test_`
|
|
||||||
|
|
||||||
## Run pylint for code check
|
---
|
||||||
|
# Installation
|
||||||
|
|
||||||
## Run Python doc with Sphinx
|
`python -m venv `
|
||||||
|
|
||||||
|
- [Python 3.11.7](https://www.python.org/downloads/)
|
||||||
|
- [Spark 3.5.0 with Hadoop 3.0.0](https://spark.apache.org/downloads.html)
|
||||||
|
- [winutils.exe, .pdb and hadoop.dll](https://github.com/steveloughran/winutils/tree/master/hadoop-3.0.0/bin)
|
||||||
|
- [Java JDK 17](https://www.azul.com/downloads/?version=java-17-lts&package=jdk#zulu)
|
||||||
|
---
|
||||||
|
# Run Python PySpark
|
||||||
|
- `python init.py`
|
||||||
|
---
|
||||||
|
# Run Python Test
|
||||||
|
- path from src/test_pyspark_training
|
||||||
|
- `pytest -k test_`
|
||||||
|
---
|
||||||
|
# Run pylint for code check
|
||||||
|
|
||||||
|
---
|
||||||
|
# Run Python doc with Sphinx
|
||||||
|
|
||||||
|
---
|
||||||
19
init.py
19
init.py
|
|
@ -1,24 +1,21 @@
|
||||||
import os
|
import os
|
||||||
import findspark
|
import sys
|
||||||
|
from dotenv import load_dotenv
|
||||||
from pyspark.sql import SparkSession
|
from pyspark.sql import SparkSession
|
||||||
from src.pyspark_training.output_dataset_1.compute_output_dataset_1 import compute_output_dataset_1
|
from src.pyspark_training.output_dataset_1.compute_output_dataset_1 import compute_output_dataset_1
|
||||||
|
|
||||||
|
|
||||||
def init_env():
|
|
||||||
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11"
|
|
||||||
os.environ["SPARK_HOME"] = "C:\\SPARK\\spark-3.1.1-bin-hadoop3.2"
|
|
||||||
os.environ["HADOOP_HOME"] = "C:\\SPARK\\hadoop"
|
|
||||||
|
|
||||||
findspark.init()
|
|
||||||
|
|
||||||
|
|
||||||
def init_spark():
|
def init_spark():
|
||||||
return SparkSession.builder.master("local[*]").getOrCreate()
|
return SparkSession.builder.master("local[*]").getOrCreate()
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
print("hey there")
|
load_dotenv()
|
||||||
init_env()
|
print(os.environ["SPARK_HOME"]) # spark-3.5.0-bin-hadoop3
|
||||||
|
print(os.environ["HADOOP_HOME"]) # spark-3.5.0-bin-hadoop3, + winutils et dll hadoop 3.0
|
||||||
|
print(os.environ["JAVA_HOME"]) # java 8 local (zulu)
|
||||||
|
print("EXEC:")
|
||||||
|
print(sys.executable)
|
||||||
spark_session = init_spark()
|
spark_session = init_spark()
|
||||||
|
|
||||||
compute_output_dataset_1(spark_session)
|
compute_output_dataset_1(spark_session)
|
||||||
|
|
|
||||||
6
requirements.txt
Normal file
6
requirements.txt
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
pyspark
|
||||||
|
pyspark-test
|
||||||
|
python-dotenv
|
||||||
|
pytest
|
||||||
|
pylint
|
||||||
|
sphinx
|
||||||
27
spark_check.py
Normal file
27
spark_check.py
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
from pyspark.sql import SparkSession
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
print(os.environ["SPARK_HOME"])
|
||||||
|
print(os.environ["HADOOP_HOME"])
|
||||||
|
print(os.environ["JAVA_HOME"])
|
||||||
|
print("EXEC:")
|
||||||
|
print(sys.executable)
|
||||||
|
|
||||||
|
spark = SparkSession.builder.getOrCreate()
|
||||||
|
df = spark.createDataFrame(
|
||||||
|
[
|
||||||
|
(1, "val1"),
|
||||||
|
(2, "val2"),
|
||||||
|
(3, "val3"),
|
||||||
|
(4, "val4"),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
df.show()
|
||||||
|
|
||||||
|
df.coalesce(1).write.mode("overwrite").csv("output/testoutput")
|
||||||
|
spark.stop()
|
||||||
|
print("Done\n")
|
||||||
|
|
@ -1,11 +1,14 @@
|
||||||
|
"""
|
||||||
|
This module is used to compute the dataset output_dataset_1.
|
||||||
|
"""
|
||||||
from pyspark.sql import SparkSession
|
from pyspark.sql import SparkSession
|
||||||
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
|
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
|
||||||
|
|
||||||
import src.pyspark_training.output_dataset_1.cleaning_output_dataset_1 as C
|
import src.pyspark_training.output_dataset_1.cleaning_output_dataset_1 as C
|
||||||
import src.pyspark_training.output_dataset_1.processing_output_dataset_1 as P
|
import src.pyspark_training.output_dataset_1.processing_output_dataset_1 as P
|
||||||
|
|
||||||
|
|
||||||
INPUT_DATASET_1_PATH = './assets/output_dataset_1/raw/RAW_input_output_dataset_1.csv'
|
INPUT_DATASET_1_PATH = './assets/output_dataset_1/raw/RAW_input_output_dataset_1.csv'
|
||||||
OUTPUT_DATASET_1_PATH = './assets/output_dataset_1/output/OUTPUT_output_dataset_1.csv'
|
OUTPUT_DATASET_1_PATH = '.\\assets\\output_dataset_1\\output\\OUTPUT_output_dataset_1.csv'
|
||||||
|
|
||||||
|
|
||||||
def compute_output_dataset_1(spark_session: SparkSession):
|
def compute_output_dataset_1(spark_session: SparkSession):
|
||||||
|
|
@ -27,4 +30,4 @@ def compute_output_dataset_1(spark_session: SparkSession):
|
||||||
df = P.add_life_stage(cleaned_df)
|
df = P.add_life_stage(cleaned_df)
|
||||||
|
|
||||||
df.show()
|
df.show()
|
||||||
df.write.mode('overwrite').csv(OUTPUT_DATASET_1_PATH)
|
df.coalesce(1).write.mode('overwrite').save(OUTPUT_DATASET_1_PATH, format='csv')
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ def spark_session(request):
|
||||||
os.environ['PYSPARK_PYTHON'] = sys.executable
|
os.environ['PYSPARK_PYTHON'] = sys.executable
|
||||||
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
|
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
|
||||||
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11"
|
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk-11"
|
||||||
os.environ["SPARK_HOME"] = "C:\\SPARK\\spark-3.1.1-bin-hadoop3.2"
|
os.environ["SPARK_HOME"] = "C:\\SPARK\\spark-3.1.1-bin-hadoop3.2\\bin"
|
||||||
os.environ["HADOOP_HOME"] = "C:\\SPARK\\hadoop"
|
os.environ["HADOOP_HOME"] = "C:\\SPARK\\hadoop"
|
||||||
|
|
||||||
findspark.init()
|
findspark.init()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue