Great introduction by the author to Spark. I just screenshotted the important and clear three-concepts diagram for reference :)
In this data flow diagram, "Spark Apply ML" covers two main functionalities: applying a trained model to streaming data in real time, and running batch predictions over stored data.
# Loading and applying a model in Spark
from pyspark.ml import PipelineModel

# Load the trained pipeline model
loaded_model = PipelineModel.load("path_to_model")

# Apply the model to a streaming DataFrame
def apply_model(streaming_df):
    # Make predictions on real-time data
    predictions = loaded_model.transform(streaming_df)
    return predictions
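For context, here is a minimal sketch of how apply_model could be wired into a Structured Streaming query. The input directory, schema, and output/checkpoint paths are assumptions for illustration, and the streaming rows must already contain the feature columns the pipeline expects.

# Hedged sketch: feed a file-based stream through apply_model and persist results
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType

spark = SparkSession.builder.appName("StreamingModelApplication").getOrCreate()

# Streaming file sources need an explicit schema (assumed feature columns)
input_schema = StructType([
    StructField("feature1", DoubleType()),
    StructField("feature2", DoubleType()),
])

streaming_df = (spark.readStream
                .schema(input_schema)
                .parquet("streaming_input"))           # assumed input directory

query = (apply_model(streaming_df)
         .writeStream
         .format("parquet")
         .option("path", "streaming_predictions")      # assumed output directory
         .option("checkpointLocation", "streaming_checkpoint")
         .start())
query.awaitTermination()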
# Spark batch prediction example
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ModelApplication").getOrCreate()

def batch_predict():
    # Read data for prediction
    data = spark.read.parquet("input_data")
    # Apply the model to generate predictions
    predictions = loaded_model.transform(data)
    # Save prediction results
    predictions.write.parquet("predictions_output")
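One caveat with the snippet above: the default save mode errors out if predictions_output already exists, so re-running the job fails. A hedged variant that overwrites prior results and trims the output to the columns downstream consumers usually need (the column names are assumptions about the pipeline's output):

def batch_predict_overwrite():
    # Same batch flow, but safe to re-run over the same output path
    data = spark.read.parquet("input_data")
    predictions = loaded_model.transform(data)
    (predictions
     .select("prediction", "probability")     # assumed classifier output columns
     .write
     .mode("overwrite")
     .parquet("predictions_output"))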
# Spark can also distribute prediction with a non-Spark model (e.g. scikit-learn):
# broadcast the driver-side model, then score each partition locally.
# Here `data` is a local list of feature rows, unlike the DataFrames above.
model_bc = spark.sparkContext.broadcast(model)
predictions = spark.sparkContext.parallelize(data) \
    .mapPartitions(lambda partition: model_bc.value.predict(list(partition)))
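Since model and data are left undefined above, here is a self-contained sketch of the same broadcast-and-score pattern, using a tiny scikit-learn model and toy feature rows purely as stand-ins (both are assumptions, not part of the original example):

# Self-contained toy example of broadcasting a driver-side model for partition-wise scoring
from sklearn.linear_model import LogisticRegression

local_model = LogisticRegression().fit([[0.0], [1.0]], [0, 1])   # toy stand-in model
features = [[0.2], [0.4], [0.9], [1.3]]                          # toy feature rows

model_bc = spark.sparkContext.broadcast(local_model)
scored = (spark.sparkContext
          .parallelize(features, numSlices=2)
          .mapPartitions(lambda part: [int(p) for p in model_bc.value.predict(list(part))])
          .collect())
print(scored)   # predictions for each row, e.g. [0, 0, 1, 1]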
# Configure Spark resources to optimize model application: executor memory/cores
# are static settings, so set them at session build time (or via spark-submit)
spark = (SparkSession.builder.appName("ModelApplication")
         .config("spark.executor.memory", "10g")
         .config("spark.executor.cores", "4")
         .getOrCreate())
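By contrast, SQL-level settings can be tuned on the live session; one example that often matters when preparing features before scoring (the value is an assumption to size against the data volume):

# Runtime-settable config: controls shuffle parallelism during joins/aggregations
spark.conf.set("spark.sql.shuffle.partitions", "200")   # assumed value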
# End-to-end data processing and model application
def end_to_end_pipeline():
    # Data preprocessing (assumes "raw_data" is registered as a table or temp view)
    processed_data = spark.sql("""
        SELECT * FROM raw_data
        WHERE quality > 0.8
    """)
    # Apply the model
    results = loaded_model.transform(processed_data)
    # Post-processing: keep only the columns downstream consumers need
    final_results = results.select(
        "prediction",
        "probability",
        "features"
    )
    return final_results
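A minimal, hedged sketch of driving this pipeline end to end; the source path, view registration, and output path are assumptions for illustration:

# Register the source data under the name the SQL above expects, run, and persist
spark.read.parquet("raw_data_path").createOrReplaceTempView("raw_data")   # assumed path
final_results = end_to_end_pipeline()
final_results.write.mode("overwrite").parquet("final_predictions")        # assumed output path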