Spark Tutorial 5: Basic Structured Operations

Load a JSON file

```python
df = spark.read.format("json").load("/data/flight-data/json/2015-summary.json")
```

Schema

Print the schema

```python
df.printSchema()
```
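
The schema is also available as an object for programmatic use (standard DataFrame attributes; a quick sketch against the file loaded above):

```python
# Programmatic access to the inferred schema and the column names
df.schema    # StructType with one StructField per column
df.columns   # ['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']
```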

Read the JSON file with an explicit schema to specify the data types

```python
from pyspark.sql.types import StructField, StructType, StringType, LongType

mySchema = StructType(
    [
        StructField("DEST_COUNTRY_NAME", StringType(), True),
        StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
        StructField("count", LongType(), False)
    ]
)
df = spark.read.format("json").schema(mySchema).load("/Users/yangyong/dev/learn_spark/2015-summary.json")
```

Get the first row

```python
df.first()
```
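
head and take are close relatives of first (standard DataFrame API):

```python
df.head()    # same as first(): returns the first Row
df.take(3)   # returns the first 3 rows as a list of Row objects
```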

Create a Row

```python
from pyspark.sql import Row

myRow = Row("Hello", None, 1, False)
```
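
Row objects behave like ordered sequences, so fields can be read by position; with the row above:

```python
myRow[0]   # 'Hello'
myRow[2]   # 1
```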

Creating DataFrames

Load a JSON file as a DataFrame

```python
df = spark.read.format("json").load("/data/flight-data/json/2015-summary.json")
```

Combine a schema and Rows into a DataFrame

```python
Schema1 = StructType(
    [
        StructField("id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("country", StringType(), True)
    ]
)

row1 = Row('1', 'Oscar', 'United States')
row2 = Row('2', 'China', 'England')
myDF = spark.createDataFrame([row1, row2], schema=Schema1)
myDF.show()

"""
+---+-----+-------------+
| id| name|      country|
+---+-----+-------------+
|  1|Oscar|United States|
|  2|China|      England|
+---+-----+-------------+
"""

Two ways to select: select and selectExpr

select

```python
from pyspark.sql.functions import expr, col, column

df.select('dest_country_name').show(2)
df.select('dest_country_name', 'origin_country_name').show(2)
df.select(expr('dest_country_name'), col('dest_country_name'), column('dest_country_name')).show(2)

"""
+-----------------+
|dest_country_name|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows

+-----------------+-------------------+
|dest_country_name|origin_country_name|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows

+-----------------+-----------------+-----------------+
|dest_country_name|dest_country_name|dest_country_name|
+-----------------+-----------------+-----------------+
|    United States|    United States|    United States|
|    United States|    United States|    United States|
+-----------------+-----------------+-----------------+
only showing top 2 rows
"""

Renaming columns

```python
df.select(expr('dest_country_name as destination')).show(2)
df.select(col('dest_country_name').alias('destination')).show(2)

"""
+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows
"""

selectExpr

Rename a column

```python
df.selectExpr('dest_country_name as destination', 'dest_country_name').show(2)

"""
+-------------+-----------------+
|  destination|dest_country_name|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows
"""

Add a new column

```python
df.selectExpr('*', '(dest_country_name = origin_country_name) as withinCountry').show(2)

"""
+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows
"""

The SQL equivalent:

```sql
SELECT *, (dest_country_name = origin_country_name) as withinCountry 
FROM dfTable LIMIT 2
```

Using aggregate functions

```python
df.selectExpr('avg(count)', 'count(distinct(dest_country_name))').show(2)

"""
+-----------+---------------------------------+
| avg(count)|count(DISTINCT dest_country_name)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+
"""

Adding columns with withColumn

```python
from pyspark.sql.functions import lit

df.withColumn('numberOne', lit(1)).show(2)
df.withColumn('withinCountry', expr('dest_country_name == origin_country_name')).show(2)

"""
+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows
"""

Renaming columns with withColumnRenamed

```python
df.withColumnRenamed('dest_country_name', 'dest').show(2)

"""
+-------------+-------------------+-----+
|         dest|ORIGIN_COUNTRY_NAME|count|
+-------------+-------------------+-----+
|United States|            Romania|   15|
|United States|            Croatia|    1|
+-------------+-------------------+-----+
only showing top 2 rows
"""

Dropping columns

```python
df.drop('origin_country_name').show(2)
"""
+-----------------+-----+
|DEST_COUNTRY_NAME|count|
+-----------------+-----+
|    United States|   15|
|    United States|    1|
+-----------------+-----+
only showing top 2 rows
"""

Changing a column's type

```python
df.withColumn('count2', col('count').cast('long'))
```
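
Casting count to long is a no-op here, since the inferred type is already long; casting to string instead makes the effect visible in the schema (a sketch, assuming the inferred schema from the JSON file):

```python
from pyspark.sql.functions import col

df.withColumn('count2', col('count').cast('string')).printSchema()
# root
#  |-- DEST_COUNTRY_NAME: string (nullable = true)
#  |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
#  |-- count: long (nullable = true)
#  |-- count2: string (nullable = true)
```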

Filtering rows with filter/where

filter and where are equivalent:

```python
df.filter('count < 2').show(2)
df.where('count < 2').show(2)
df.where(col('count') < 2).show(2)

"""
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows
"""

Filtering on multiple conditions

```python
df.where('count < 2').where('dest_country_name != "United States"').show(2)

"""
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|          Moldova|      United States|    1|
|            Malta|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows
"""

Deduplication

```python
df.select('dest_country_name', 'origin_country_name').distinct().count()

"""
equal to SQL:
SELECT COUNT(DISTINCT(dest_country_name, origin_country_name)) FROM dfTable;
"""

Unioning DataFrames

The DataFrames must share the same schema and number of columns to be unioned. Note that union matches columns by position, not by name (see the unionByName sketch after the example below).

```python
from pyspark.sql import Row
schema = df.schema
newRows = [
    Row("New Country", "Other Country", 5),
    Row("New Country 2", "Other Country 3", 1)
]
newDF = spark.createDataFrame(newRows, schema)

df.union(newDF)\
   .where("count = 1")\
   .where(col("ORIGIN_COUNTRY_NAME") != "United States")\
   .show()
"""
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Lithuania|    1|
|    United States|           Bulgaria|    1|
|    United States|            Georgia|    1|
|    United States|            Bahrain|    1|
|    United States|   Papua New Guinea|    1|
|    United States|         Montenegro|    1|
|    United States|            Namibia|    1|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+
"""

Sorting rows with sort/orderBy

The two methods are equivalent:

```python
df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)

from pyspark.sql.functions import desc, asc

# Note: expr("count desc") is parsed as the column `count` aliased to `desc`,
# not as a descending sort, so use desc()/Column.desc() instead:
df.orderBy(desc("count")).show(2)
df.orderBy(col("count").desc(), col("DEST_COUNTRY_NAME").asc()).show(2)
```
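
To control where nulls land in a sort, Column also provides null-ordering variants (a sketch; these methods exist from Spark 2.4 on):

```python
# Descending by count, with any null counts sorted last
df.orderBy(col('count').desc_nulls_last()).show(2)
```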

Limit

```python
df.limit(5).show()
# As above, use desc("count") rather than expr("count desc") for a descending sort
df.orderBy(desc("count")).limit(6).show()
```