Loading a JSON file
python
df = spark.read.format("json").load("/data/flight-data/json/2015-summary.json")
Schema
Printing the schema
python
df.printSchema()
Reading a JSON file with an explicit schema to specify the data types
python
from pyspark.sql.types import StructField, StructType, StringType, LongType
mySchema = StructType(
    [
        StructField("DEST_COUNTRY_NAME", StringType(), True),
        StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
        StructField("count", LongType(), False)
    ]
)
df = spark.read.format("json").schema(mySchema).load("/Users/yangyong/dev/learn_spark/2015-summary.json")
Rows
Getting the first row
python
df.first()
Creating a row
python
from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)
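Values on an unnamed Row are accessed by position; Rows built with keyword arguments also allow attribute access. A quick sketch (the named fields below are illustrative):
python
myRow[0]  # 'Hello'
myRow[2]  # 1

# Keyword arguments give the Row named fields:
named = Row(name="Oscar", age=30)
named.name  # 'Oscar'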
Creating DataFrames
Loading a JSON file as a DataFrame
python
df = spark.read.format("json").load("/data/flight-data/json/2015-summary.json")
Combining a schema and rows into a DataFrame
python
Schema1 = StructType(
    [
        StructField("id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("country", StringType(), True)
    ]
)
row1 = Row('1', 'Oscar', 'United States')
row2 = Row('2', 'China', 'England')
myDF = spark.createDataFrame([row1, row2], schema=Schema1)
myDF.show()
"""
+---+-----+-------------+
| id| name| country|
+---+-----+-------------+
| 1|Oscar|United States|
| 2|China| England|
+---+-----+-------------+
"""
Two ways to query: select and selectExpr
select
python
from pyspark.sql.functions import expr, col, column
df.select('dest_country_name').show(2)
df.select('dest_country_name', 'origin_country_name').show(2)
df.select(expr('dest_country_name'), col('dest_country_name'), column('dest_country_name')).show(2)
"""
+-----------------+
|dest_country_name|
+-----------------+
| United States|
| United States|
+-----------------+
only showing top 2 rows
+-----------------+-------------------+
|dest_country_name|origin_country_name|
+-----------------+-------------------+
| United States| Romania|
| United States| Croatia|
+-----------------+-------------------+
only showing top 2 rows
+-----------------+-----------------+-----------------+
|dest_country_name|dest_country_name|dest_country_name|
+-----------------+-----------------+-----------------+
| United States| United States| United States|
| United States| United States| United States|
+-----------------+-----------------+-----------------+
only showing top 2 rows
"""
Renaming columns
python
df.select(expr('dest_country_name as destination')).show(2)
df.select(col('dest_country_name').alias('destination')).show(2)
"""
+-------------+
| destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows
+-------------+
| destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows
"""
selectExpr
Renaming columns
python
df.selectExpr('dest_country_name as destination', 'dest_country_name').show(2)
"""
+-------------+-----------------+
| destination|dest_country_name|
+-------------+-----------------+
|United States| United States|
|United States| United States|
+-------------+-----------------+
only showing top 2 rows
"""
Adding a column
python
df.selectExpr('*', '(dest_country_name = origin_country_name) as withinCountry').show(2)
"""
+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
| United States| Romania| 15| false|
| United States| Croatia| 1| false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows
"""
Equivalent SQL:
sql
SELECT *, (dest_country_name = origin_country_name) AS withinCountry
FROM dfTable LIMIT 2
Using aggregate functions
python
df.selectExpr('avg(count)', 'count(distinct(dest_country_name))').show(2)
"""
+-----------+---------------------------------+
| avg(count)|count(DISTINCT dest_country_name)|
+-----------+---------------------------------+
|1770.765625| 132|
+-----------+---------------------------------+
"""
Adding columns with withColumn
python
from pyspark.sql.functions import lit
df.withColumn('numberOne', lit(1)).show(2)
df.withColumn('withinCountry', expr('dest_country_name == origin_country_name')).show(2)
"""
+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
| United States| Romania| 15| 1|
| United States| Croatia| 1| 1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows
+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
| United States| Romania| 15| false|
| United States| Croatia| 1| false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows
"""
Renaming columns with withColumnRenamed
python
df.withColumnRenamed('dest_country_name', 'dest').show(2)
"""
+-------------+-------------------+-----+
| dest|ORIGIN_COUNTRY_NAME|count|
+-------------+-------------------+-----+
|United States| Romania| 15|
|United States| Croatia| 1|
+-------------+-------------------+-----+
only showing top 2 rows
"""
Dropping columns
python
df.drop('origin_country_name').show(2)
"""
+-----------------+-----+
|DEST_COUNTRY_NAME|count|
+-----------------+-----+
| United States| 15|
| United States| 1|
+-----------------+-----+
only showing top 2 rows
"""
Changing a column's type
python
df.withColumn('count2', col('count').cast('long'))
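A quick way to confirm the cast took effect is to print the schema; count2 should show up as long:
python
df.withColumn('count2', col('count').cast('long')).printSchema()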
Filtering rows with filter/where
The two methods are equivalent; all three calls below produce the same result
python
df.filter('count < 2').show(2)
df.where('count < 2').show(2)
df.where(col('count') < 2).show(2)
"""
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
| United States| Croatia| 1|
| United States| Singapore| 1|
+-----------------+-------------------+-----+
only showing top 2 rows
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
| United States| Croatia| 1|
| United States| Singapore| 1|
+-----------------+-------------------+-----+
only showing top 2 rows
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
| United States| Croatia| 1|
| United States| Singapore| 1|
+-----------------+-------------------+-----+
only showing top 2 rows
"""
Filtering on multiple conditions
python
df.where('count < 2').where('dest_country_name != "United States"').show(2)
"""
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
| Moldova| United States| 1|
| Malta| United States| 1|
+-----------------+-------------------+-----+
only showing top 2 rows
"""
Deduplication
python
df.select('dest_country_name', 'origin_country_name').distinct().count()
"""
Equivalent SQL:
SELECT COUNT(DISTINCT(dest_country_name, origin_country_name)) FROM dfTable;
"""
Unioning DataFrames
The two DataFrames must share the same schema and number of columns for the union to succeed.
python
from pyspark.sql import Row
schema = df.schema
newRows = [
    Row("New Country", "Other Country", 5),
    Row("New Country 2", "Other Country 3", 1)
]
newDF = spark.createDataFrame(newRows, schema)
df.union(newDF) \
    .where("count = 1") \
    .where(col("ORIGIN_COUNTRY_NAME") != "United States") \
    .show()
"""
+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
| United States| Croatia| 1|
| United States| Singapore| 1|
| United States| Gibraltar| 1|
| United States| Cyprus| 1|
| United States| Estonia| 1|
| United States| Lithuania| 1|
| United States| Bulgaria| 1|
| United States| Georgia| 1|
| United States| Bahrain| 1|
| United States| Papua New Guinea| 1|
| United States| Montenegro| 1|
| United States| Namibia| 1|
| New Country 2| Other Country 3| 1|
+-----------------+-------------------+-----+
"""
Sorting rows with sort/orderBy
The two methods are equivalent
python
df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)
from pyspark.sql.functions import desc, asc
df.orderBy(desc("count")).show(2)  # note: expr("count desc") would not sort descending; "desc" parses as a column alias
df.orderBy(col("count").desc(), col("DEST_COUNTRY_NAME").asc()).show(2)
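Where null values land in a sort can be controlled explicitly; a sketch using desc_nulls_last (asc_nulls_first and related variants also exist in pyspark.sql.functions):
python
from pyspark.sql.functions import desc_nulls_last
# Any null counts go to the end of the descending sort.
df.orderBy(desc_nulls_last('count')).show(2)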
Limit
python
df.limit(5).show()
df.orderBy(desc("count")).limit(6).show()
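Equivalent SQL:
sql
SELECT * FROM dfTable ORDER BY count DESC LIMIT 6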