Commit 3495ea9

Add PySpark Demo (#34)
1 parent e925908 commit 3495ea9

8 files changed: +307 -0 lines changed

demo/README.md

Lines changed: 28 additions & 0 deletions

@@ -9,6 +9,8 @@ This demo is composed of 3 parts:
 - `ReadWriteDemo`: reads the ArangoDB collections created above as Spark Dataframes, applies projections and filtering,
   writes to a new ArangoDB collection
 
+Demos are available in both Scala and Python (using PySpark), as outlined below.
+
 ## Requirements
 
 This demo requires:
@@ -17,6 +19,9 @@ This demo requires:
 - `maven`
 - `docker`
 
+For the Python demo, you will also need:
+- `python`
+
 ## Prepare the environment
 
 Set environment variables:
@@ -79,3 +84,26 @@ docker run -it --rm \
   --packages="com.arangodb:arangodb-spark-datasource-3.2_2.12:$ARANGO_SPARK_VERSION" \
   --class Demo /demo/target/demo-$ARANGO_SPARK_VERSION.jar
 ```
+
+## Python (PySpark) Demo
+
+This demo requires the same environment setup as outlined above.
+Additionally, the Python requirements need to be installed as follows:
+```shell
+pip install -r ./python-demo/requirements.txt
+```
+
+To run the PySpark demo, run:
+```shell
+python ./python-demo/demo.py \
+  --endpoints=172.28.0.1:8529,172.28.0.1:8539,172.28.0.1:8549
+```
+
+To run it against an Oasis deployment, run:
+```shell
+python ./python-demo/demo.py \
+  --password=<root-password> \
+  --endpoints=<endpoint> \
+  --ssl-enabled=true \
+  --ssl-cert-value=<base64-encoded-cert>
+```
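
For orientation, each of the CLI flags above maps onto an ArangoDB Spark DataSource option inside `demo.py` (shown below). A minimal sketch of the resulting options dictionary, with placeholder values in place of real credentials:

```python
# Sketch only: the base options that demo.py builds from the CLI flags
# (see create_base_arangodb_datasource_opts below); values are placeholders.
base_opts = {
    "password": "<root-password>",              # --password
    "endpoints": "<endpoint>",                  # --endpoints (comma-separated host:port list)
    "ssl.enabled": "true",                      # --ssl-enabled
    "ssl.cert.value": "<base64-encoded-cert>",  # --ssl-cert-value
}
```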

demo/python-demo/demo.py

Lines changed: 56 additions & 0 deletions

import os
import pathlib
from argparse import ArgumentParser
from typing import Dict

from pyspark.sql import SparkSession

from read_write_demo import read_write_demo
from read_demo import read_demo
from write_demo import write_demo


def create_spark_session() -> SparkSession:
    # Here we can initialize the spark session, and in doing so,
    # include the ArangoDB Spark DataSource package
    arango_spark_version = os.environ["ARANGO_SPARK_VERSION"]

    spark = SparkSession.builder \
        .appName("ArangoDBPySparkDataTypesExample") \
        .master("local[*]") \
        .config("spark.jars.packages", f"com.arangodb:arangodb-spark-datasource-3.2_2.12:{arango_spark_version}") \
        .getOrCreate()

    return spark


def create_base_arangodb_datasource_opts(password: str, endpoints: str, ssl_enabled: str, ssl_cert_value: str) -> Dict[str, str]:
    return {
        "password": password,
        "endpoints": endpoints,
        "ssl.enabled": ssl_enabled,
        "ssl.cert.value": ssl_cert_value,
    }


def main():
    parser = ArgumentParser()
    parser.add_argument("--import-path", default=None)
    parser.add_argument("--password", default="test")
    parser.add_argument("--endpoints", default="localhost:8529")
    parser.add_argument("--ssl-enabled", default="false")
    parser.add_argument("--ssl-cert-value", default="")
    args = parser.parse_args()

    if args.import_path is None:
        args.import_path = pathlib.Path(__file__).resolve().parent.parent / "docker" / "import"

    spark = create_spark_session()
    base_opts = create_base_arangodb_datasource_opts(args.password, args.endpoints, args.ssl_enabled, args.ssl_cert_value)
    write_demo(spark, base_opts, args.import_path)
    read_demo(spark, base_opts)
    read_write_demo(spark, base_opts)


if __name__ == "__main__":
    main()

demo/python-demo/read_demo.py

Lines changed: 63 additions & 0 deletions

from typing import Dict

import pyspark.sql
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

from schemas import movie_schema
from utils import combine_dicts


def read_demo(spark: SparkSession, base_opts: Dict[str, str]):
    movies_df = read_collection(spark, "movies", base_opts, movie_schema)

    print("Read table: history movies or documentaries about 'World War' released from 2000-01-01")
    # We can get to what we want in 2 different ways:
    # First, the PySpark dataframe way...
    movies_df \
        .select("title", "releaseDate", "genre", "description") \
        .filter("genre IN ('History', 'Documentary') AND description LIKE '%World War%' AND releaseDate > '2000'") \
        .show()

    # Second, in the Pandas on Spark way...
    movies_pd_df = movies_df.to_pandas_on_spark()
    subset = movies_pd_df[["title", "releaseDate", "genre", "description"]]
    recent_ww_movies = subset[subset["genre"].isin(["History", "Documentary"])
                              & (subset["releaseDate"] >= '2000')
                              & subset["description"].str.contains("World War")]
    print(recent_ww_movies)

    print("Read query: actors of movies directed by Clint Eastwood with related movie title and interpreted role")
    read_aql_query(
        spark,
        """WITH movies, persons
        FOR v, e, p IN 2 ANY "persons/1062" OUTBOUND directed, INBOUND actedIn
        RETURN {movie: p.vertices[1].title, name: v.name, role: p.edges[1].name}
        """,
        base_opts,
        StructType([
            StructField("movie", StringType()),
            StructField("name", StringType()),
            StructField("role", StringType())
        ])
    ).show(20, 200)


def read_collection(spark: SparkSession, collection_name: str, base_opts: Dict[str, str], schema: StructType) -> pyspark.sql.DataFrame:
    arangodb_datasource_options = combine_dicts([base_opts, {"table": collection_name}])

    return spark.read \
        .format("com.arangodb.spark") \
        .options(**arangodb_datasource_options) \
        .schema(schema) \
        .load()


def read_aql_query(spark: SparkSession, query: str, base_opts: Dict[str, str], schema: StructType) -> pyspark.sql.DataFrame:
    arangodb_datasource_options = combine_dicts([base_opts, {"query": query}])

    return spark.read \
        .format("com.arangodb.spark") \
        .options(**arangodb_datasource_options) \
        .schema(schema) \
        .load()
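
As a side note, the string-based projection and filter in `read_demo` above could equally be written with typed column expressions. A minimal sketch, assuming a `movies_df` DataFrame read with `movie_schema`:

```python
# Sketch only: the same "history/documentary World War movies since 2000"
# selection as above, using pyspark.sql column expressions instead of a SQL string.
from pyspark.sql import DataFrame, functions as F


def recent_ww_movies(movies_df: DataFrame) -> DataFrame:
    return (
        movies_df
        .select("title", "releaseDate", "genre", "description")
        .filter(
            F.col("genre").isin("History", "Documentary")
            & F.col("description").contains("World War")
            & (F.col("releaseDate") >= "2000-01-01")
        )
    )
```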

demo/python-demo/read_write_demo.py

Lines changed: 20 additions & 0 deletions

from typing import Dict

from pyspark.sql import SparkSession

import read_demo
import write_demo
from schemas import movie_schema


def read_write_demo(spark: SparkSession, opts: Dict[str, str]):
    print("-----------------------")
    print("--- READ-WRITE DEMO ---")
    print("-----------------------")

    print("Reading 'movies' collection and writing 'actionMovies' collection...")
    action_movies_df = read_demo.read_collection(spark, "movies", opts, movie_schema)\
        .select("_key", "title", "releaseDate", "runtime", "description")\
        .filter("genre = 'Action'")
    write_demo.save_df(action_movies_df.to_pandas_on_spark(), "actionMovies", opts)
    print("You can now view the actionMovies collection in ArangoDB!")

demo/python-demo/requirements.txt

Lines changed: 1 addition & 0 deletions

pyspark[pandas_on_spark]==3.2.1

demo/python-demo/schemas.py

Lines changed: 50 additions & 0 deletions

from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DateType, IntegerType

movie_schema: StructType = StructType([
    StructField("_id", StringType(), nullable=False),
    StructField("_key", StringType(), nullable=False),
    StructField("description", StringType()),
    StructField("genre", StringType()),
    StructField("homepage", StringType()),
    StructField("imageUrl", StringType()),
    StructField("imdbId", StringType()),
    StructField("language", StringType()),
    StructField("lastModified", TimestampType()),
    StructField("releaseDate", DateType()),
    StructField("runtime", IntegerType()),
    StructField("studio", StringType()),
    StructField("tagline", StringType()),
    StructField("title", StringType()),
    StructField("trailer", StringType())
])
person_schema: StructType = StructType([
    StructField("_id", StringType(), nullable=False),
    StructField("_key", StringType(), nullable=False),
    StructField("biography", StringType()),
    StructField("birthday", DateType()),
    StructField("birthplace", StringType()),
    StructField("lastModified", TimestampType()),
    StructField("name", StringType()),
    StructField("profileImageUrl", StringType())
])
edges_schema: StructType = StructType([
    StructField("_key", StringType(), nullable=False),
    StructField("_from", StringType(), nullable=False),
    StructField("_to", StringType(), nullable=False),
    StructField("$label", StringType()),
    StructField("name", StringType()),
    StructField("type", StringType()),
])
acts_in_schema: StructType = StructType([
    StructField("_id", StringType(), nullable=False),
    StructField("_key", StringType(), nullable=False),
    StructField("_from", StringType(), nullable=False),
    StructField("_to", StringType(), nullable=False),
    StructField("name", StringType())
])
directed_schema: StructType = StructType([
    StructField("_id", StringType(), nullable=False),
    StructField("_key", StringType(), nullable=False),
    StructField("_from", StringType(), nullable=False),
    StructField("_to", StringType(), nullable=False)
])
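
Note that `write_demo.py` (below) selects columns with slices like `movie_schema.fieldNames()[1:]`, which simply drops the leading `_id` field. A quick illustration, using a small stand-in schema rather than the ones above:

```python
from pyspark.sql.types import StructType, StructField, StringType

# Illustration only: fieldNames() returns the field names in declaration order,
# so slicing off the first entry removes "_id" (the pattern used in write_demo.py).
example_schema = StructType([
    StructField("_id", StringType(), nullable=False),
    StructField("_key", StringType(), nullable=False),
    StructField("title", StringType()),
])

print(example_schema.fieldNames())      # ['_id', '_key', 'title']
print(example_schema.fieldNames()[1:])  # ['_key', 'title']
```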

demo/python-demo/utils.py

Lines changed: 5 additions & 0 deletions

def combine_dicts(list_of_dicts):
    whole_dict = {}
    for d in list_of_dicts:
        whole_dict.update(d)
    return whole_dict
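
For reference, a minimal usage sketch of `combine_dicts`: the read and write helpers merge the shared connection options with per-call options such as `table` or `query`, with later dictionaries overriding earlier ones.

```python
from utils import combine_dicts

# Later dicts win on key collisions, since dict.update is applied in list order.
base_opts = {"password": "test", "endpoints": "localhost:8529"}
read_opts = combine_dicts([base_opts, {"table": "movies"}])
print(read_opts)  # {'password': 'test', 'endpoints': 'localhost:8529', 'table': 'movies'}
```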

demo/python-demo/write_demo.py

Lines changed: 84 additions & 0 deletions

import datetime
import pathlib
from typing import Dict

from pyspark import pandas as ps
from pyspark.sql import SparkSession, functions as f
from pyspark.sql.types import StructType

from utils import combine_dicts
from schemas import person_schema, movie_schema, directed_schema, acts_in_schema


def save_df(ps_df, table_name: str, options: Dict[str, str], table_type: str = None) -> None:
    if not table_type:
        table_type = "document"

    all_opts = combine_dicts([options, {
        "table.shards": "9",
        "confirmTruncate": "true",
        "overwriteMode": "replace",
        "table": table_name,
        "table.type": table_type
    }])

    ps_df.to_spark()\
        .write\
        .mode("overwrite")\
        .format("com.arangodb.spark")\
        .options(**all_opts)\
        .save()


def write_demo(spark: SparkSession, save_opts: Dict[str, str], import_path_str: str):
    import_path = pathlib.Path(import_path_str)

    print("Read Nodes from JSONL using Pandas on Spark API")
    nodes_pd_df = ps.read_json(str(import_path / "nodes.jsonl"))
    nodes_pd_df = nodes_pd_df[nodes_pd_df["_key"].notnull()]
    nodes_pd_df["releaseDate"] = ps.to_datetime(nodes_pd_df["releaseDate"], unit="ms")
    nodes_pd_df["birthday"] = ps.to_datetime(nodes_pd_df["birthday"], unit="ms")

    def convert_to_timestamp(to_modify, column):
        tz_aware_datetime = datetime.datetime.utcfromtimestamp(
            int(to_modify[column])/1000
        ).replace(tzinfo=datetime.timezone.utc).astimezone(tz=None)
        tz_naive = tz_aware_datetime.replace(tzinfo=None)
        to_modify[column] = tz_naive
        return to_modify

    nodes_pd_df = nodes_pd_df.apply(convert_to_timestamp, axis=1, args=("lastModified",))

    nodes_df = nodes_pd_df.to_spark()
    nodes_pd_df = nodes_df\
        .withColumn("releaseDate", f.to_date(nodes_df["releaseDate"])) \
        .withColumn("birthday", f.to_date(nodes_df["birthday"])) \
        .to_pandas_on_spark()

    print("Read Edges from JSONL using PySpark API")
    edges_df = spark.read.json(str(import_path / "edges.jsonl"))
    # apply the schema to change nullability of _key, _from, and _to columns in schema
    edges_pd_df = edges_df.to_pandas_on_spark()
    edges_pd_df["_from"] = "persons/" + edges_pd_df["_from"]
    edges_pd_df["_to"] = "movies/" + edges_pd_df["_to"]

    print("Create the collection dfs")
    persons_df = nodes_pd_df[nodes_pd_df["type"] == "Person"][person_schema.fieldNames()[1:]]
    movies_df = nodes_pd_df[nodes_pd_df["type"] == "Movie"][movie_schema.fieldNames()[1:]]
    directed_df = edges_pd_df[edges_pd_df["$label"] == "DIRECTED"][directed_schema.fieldNames()[1:]]
    acted_in_df = edges_pd_df[edges_pd_df["$label"] == "ACTS_IN"][acts_in_schema.fieldNames()[1:]]

    # _from and _to need to be set with nullable=False in the schema in order for it to work
    directed_df = spark.createDataFrame(directed_df.to_spark().rdd, StructType(
        directed_schema.fields[1:])).to_pandas_on_spark()
    acted_in_df = spark.createDataFrame(acted_in_df.to_spark().rdd, StructType(
        acts_in_schema.fields[1:])).to_pandas_on_spark()

    print("writing the persons collection")
    save_df(persons_df, "persons", save_opts)
    print("writing the movies collection")
    save_df(movies_df, "movies", save_opts)
    print("writing the 'directed' edge collection")
    save_df(directed_df, "directed", save_opts, "edge")
    print("writing the 'actedIn' collection")
    save_df(acted_in_df, "actedIn", save_opts, "edge")
