-
Notifications
You must be signed in to change notification settings - Fork 2
/
eg2_flight_data_json.py
61 lines (42 loc) · 1.38 KB
/
eg2_flight_data_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr,lit
# Build (or reuse, via getOrCreate) a SparkSession with master "local[*]"
# and the application name used for this example.
spark = SparkSession.builder.master("local[*]").appName("eg_flight_data_json").getOrCreate()
# Everything below needs the live session, so it runs under try/finally:
# spark.stop() is then reached even if the read or any of the queries raises.
try:
    # Read the JSON source into a DataFrame and print the schema Spark inferred.
    df = spark.read.format("json").load("data/flight_data/json/2015-summary.json")
    df.printSchema()

    # Register the DataFrame as a temp view so it can also be queried with SQL.
    df.createOrReplaceTempView("dfTable")

    # The same single-column projection, first through the DataFrame API and
    # then through SQL against the temp view.
    df.select("DEST_COUNTRY_NAME").show(2)
    spark.sql("\nSELECT DEST_COUNTRY_NAME FROM dfTable LIMIT 2\n").show()

    df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

    # expr() turns a SQL expression string into a Column; the second call
    # applies .alias("DEST") on top of the "as destination" inside the string.
    df.select(expr("DEST_COUNTRY_NAME as destination")).show(2)
    df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST")).show(2)

    # selectExpr() accepts SQL expression strings directly.
    df.selectExpr("DEST_COUNTRY_NAME as newColumn", "DEST_COUNTRY_NAME").show(4)
    df.selectExpr(
        "*",
        "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) AS withinCountry",
    ).show(5, truncate=False)
    # Aggregate expressions evaluated over the whole DataFrame.
    df.selectExpr(
        "AVG(count)",
        "COUNT(DISTINCT(DEST_COUNTRY_NAME))",
    ).show()
    # All existing columns plus a literal column.
    df.selectExpr(
        "*",
        "1 AS One",
    ).show(5)

    # withColumn() adds one column at a time: a literal, a boolean
    # comparison of two columns, and a copy of an existing column.
    df.withColumn("columnOne", lit(1)).show()
    df.withColumn("withInCountry", expr("DEST_COUNTRY_NAME=ORIGIN_COUNTRY_NAME")).show(5)
    df.withColumn("dest", expr("DEST_COUNTRY_NAME")).show(5)

    # A column name containing spaces and a dash is backtick-quoted when it
    # is referenced inside an expression string.
    dfwithLongColumn = df.withColumn("This is long column-name", expr("DEST_COUNTRY_NAME"))
    dfwithLongColumn.selectExpr(
        "`This is long column-name`",
        "`This is long column-name` AS `Long Col`",
    ).show()

    # Number of partitions of the DataFrame's underlying RDD.
    print(df.rdd.getNumPartitions())
finally:
    # Always stop the session so it is shut down even after a failure.
    spark.stop()