DataFrame Window Function Operations

A note up front: I am Octopus, a name that comes from my Chinese name 章鱼 (octopus). I love programming, algorithms, and open source. All source code is on my personal GitHub; this blog records the bits and pieces of my learning. If you are interested in Python, Java, AI, or algorithms, feel free to follow me so we can learn and improve together.

Related articles:

  1. PySpark Overview
  2. Spark Connect Quick Start
  3. Quick Start with the pandas API on Spark

Creating the SparkSession object

python
import warnings
warnings.filterwarnings('ignore')
#import pandas as pd
#import numpy as np
from datetime import timedelta, date, datetime
import time
import gc
import os
import argparse                             
import sys

from pyspark.sql import SparkSession, functions as fn
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import *
from pyspark import StorageLevel
spark = SparkSession \
        .builder \
        .appName("stockout_test") \
        .config("hive.exec.dynamic.partition.mode", "nonstrict") \
        .config("spark.sql.sources.partitionOverwriteMode", "dynamic")\
        .config("spark.driver.memory", '20g')\
        .config("spark.executor.memory", '40g')\
        .config("spark.yarn.executor.memoryOverhead", '1g')\
        .config("spark.executor.instances", 8)\
        .config("spark.executor.cores", 8)\
        .config("spark.kryoserializer.buffer.max", '128m')\
        .config("spark.yarn.queue", 'root.algo')\
        .config("spark.executorEnv.OMP_NUM_THREADS", 12)\
        .config("spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT", 1) \
        .config("spark.default.parallelism", 800)\
        .enableHiveSupport() \
        .getOrCreate()
spark.sql("set hive.exec.dynamic.partition.mode = nonstrict")
spark.sql("set hive.exec.dynamic.partition=true")
spark.sql("set spark.sql.autoBroadcastJoinThreshold=-1")

Creating a DataFrame

python
employee_salary = [
    ("zhangsan", "IT", 8000),
    ("lisi", "IT", 7000),
    ("wangwu", "IT", 7500),
    ("zhaoliu", "ALGO", 10000),
    ("qisan", "IT", 8000),
    ("bajiu", "ALGO", 12000),
    ("james", "ALGO", 11000),
    ("wangzai", "INCREASE", 7000),
    ("carter", "INCREASE", 8000),
    ("kobe", "IT", 9000)]
 
columns= ["name", "department", "salary"]
df = spark.createDataFrame(data = employee_salary, schema = columns)
df.show()
+--------+----------+------+
|    name|department|salary|
+--------+----------+------+
|zhangsan|        IT|  8000|
|    lisi|        IT|  7000|
|  wangwu|        IT|  7500|
| zhaoliu|      ALGO| 10000|
|   qisan|        IT|  8000|
|   bajiu|      ALGO| 12000|
|   james|      ALGO| 11000|
| wangzai|  INCREASE|  7000|
|  carter|  INCREASE|  8000|
|    kobe|        IT|  9000|
+--------+----------+------+
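
Here createDataFrame infers the column types from the Python values (strings for name and department, a long for salary). If you prefer to pin the types explicitly, you can pass a StructType schema instead of the list of column names; a minimal sketch, reusing the employee_salary data above:

python
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema instead of letting Spark infer types from the tuples.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("department", StringType(), True),
    StructField("salary", IntegerType(), True),
])
df = spark.createDataFrame(data=employee_salary, schema=schema)
df.printSchema()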

row_number()

python
from pyspark.sql.window import Window
import pyspark.sql.functions as F
 
windowSpec  = Window.partitionBy("department").orderBy(F.desc("salary"))
df.withColumn("row_number", F.row_number().over(windowSpec)).show(truncate=False)
+--------+----------+------+----------+
|name    |department|salary|row_number|
+--------+----------+------+----------+
|carter  |INCREASE  |8000  |1         |
|wangzai |INCREASE  |7000  |2         |
|kobe    |IT        |9000  |1         |
|zhangsan|IT        |8000  |2         |
|qisan   |IT        |8000  |3         |
|wangwu  |IT        |7500  |4         |
|lisi    |IT        |7000  |5         |
|bajiu   |ALGO      |12000 |1         |
|james   |ALGO      |11000 |2         |
|zhaoliu |ALGO      |10000 |3         |
+--------+----------+------+----------+
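
A typical use of row_number() is keeping exactly one row per group, for example the highest-paid employee in each department. A short sketch building on the windowSpec defined above (the top_earners name is just for illustration):

python
# Keep only the top earner per department: row_number == 1 after ordering by salary desc.
top_earners = df.withColumn("row_number", F.row_number().over(windowSpec)) \
                .where(F.col("row_number") == 1) \
                .drop("row_number")
top_earners.show()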

rank()

python
from pyspark.sql.window import Window
import pyspark.sql.functions as F
 
windowSpec  = Window.partitionBy("department").orderBy(F.desc("salary"))
df.withColumn("rank",F.rank().over(windowSpec)).show(truncate=False)
+--------+----------+------+----+
|name    |department|salary|rank|
+--------+----------+------+----+
|carter  |INCREASE  |8000  |1   |
|wangzai |INCREASE  |7000  |2   |
|kobe    |IT        |9000  |1   |
|qisan   |IT        |8000  |2   |
|zhangsan|IT        |8000  |2   |
|wangwu  |IT        |7500  |4   |
|lisi    |IT        |7000  |5   |
|bajiu   |ALGO      |12000 |1   |
|james   |ALGO      |11000 |2   |
|zhaoliu |ALGO      |10000 |3   |
+--------+----------+------+----+

dense_rank()

python
from pyspark.sql.window import Window
import pyspark.sql.functions as F
 
windowSpec  = Window.partitionBy("department").orderBy(F.desc("salary"))
df.withColumn("dense_rank",F.dense_rank().over(windowSpec)).show()
+--------+----------+------+----------+
|    name|department|salary|dense_rank|
+--------+----------+------+----------+
|  carter|  INCREASE|  8000|         1|
| wangzai|  INCREASE|  7000|         2|
|    kobe|        IT|  9000|         1|
|   qisan|        IT|  8000|         2|
|zhangsan|        IT|  8000|         2|
|  wangwu|        IT|  7500|         3|
|    lisi|        IT|  7000|         4|
|   bajiu|      ALGO| 12000|         1|
|   james|      ALGO| 11000|         2|
| zhaoliu|      ALGO| 10000|         3|
+--------+----------+------+----------+
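
The difference between the three ranking functions is easiest to see side by side: row_number() always increments, rank() gives tied salaries the same value and then skips (zhangsan and qisan are both 2, so wangwu gets 4), and dense_rank() gives ties the same value without leaving gaps. A sketch computing all three over the same window:

python
# Compare the three ranking functions over the same window specification.
df.withColumn("row_number", F.row_number().over(windowSpec)) \
  .withColumn("rank", F.rank().over(windowSpec)) \
  .withColumn("dense_rank", F.dense_rank().over(windowSpec)) \
  .show()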

lag()

python
from pyspark.sql.window import Window
import pyspark.sql.functions as F
 
windowSpec  = Window.partitionBy("department").orderBy(F.desc("salary"))
df.withColumn("lag",F.lag("salary",1).over(windowSpec)).show()
+--------+----------+------+-----+
|    name|department|salary|  lag|
+--------+----------+------+-----+
|  carter|  INCREASE|  8000| null|
| wangzai|  INCREASE|  7000| 8000|
|    kobe|        IT|  9000| null|
|zhangsan|        IT|  8000| 9000|
|   qisan|        IT|  8000| 8000|
|  wangwu|        IT|  7500| 8000|
|    lisi|        IT|  7000| 7500|
|   bajiu|      ALGO| 12000| null|
|   james|      ALGO| 11000|12000|
| zhaoliu|      ALGO| 10000|11000|
+--------+----------+------+-----+
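
lag() is typically used to compare each row with the previous row in the window, for example how far each salary falls below the next-higher salary in the same department. A minimal sketch (the prev_salary and diff_to_prev column names are illustrative):

python
# Difference between each salary and the previous (higher) salary in the department.
df.withColumn("prev_salary", F.lag("salary", 1).over(windowSpec)) \
  .withColumn("diff_to_prev", F.col("salary") - F.col("prev_salary")) \
  .show()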

lead()

python
from pyspark.sql.window import Window
import pyspark.sql.functions as F
 
windowSpec  = Window.partitionBy("department").orderBy(F.desc("salary"))
df.withColumn("lead",F.lead("salary", 1).over(windowSpec)).show()
+--------+----------+------+-----+
|    name|department|salary| lead|
+--------+----------+------+-----+
|  carter|  INCREASE|  8000| 7000|
| wangzai|  INCREASE|  7000| null|
|    kobe|        IT|  9000| 8000|
|zhangsan|        IT|  8000| 8000|
|   qisan|        IT|  8000| 7500|
|  wangwu|        IT|  7500| 7000|
|    lisi|        IT|  7000| null|
|   bajiu|      ALGO| 12000|11000|
|   james|      ALGO| 11000|10000|
| zhaoliu|      ALGO| 10000| null|
+--------+----------+------+-----+
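
Symmetrically, lead() looks at the following row, for example the gap down to the next-lower salary in the department. A minimal sketch (column names are illustrative):

python
# Gap between each salary and the next (lower) salary in the department.
df.withColumn("next_salary", F.lead("salary", 1).over(windowSpec)) \
  .withColumn("diff_to_next", F.col("salary") - F.col("next_salary")) \
  .show()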

Aggregate Functions

python
from pyspark.sql.window import Window
import pyspark.sql.functions as F
 
windowSpec  = Window.partitionBy("department").orderBy(F.desc("salary"))
windowSpecAgg  = Window.partitionBy("department")
 
df.withColumn("row", F.row_number().over(windowSpec)) \
  .withColumn("avg", F.avg("salary").over(windowSpecAgg)) \
  .withColumn("sum", F.sum("salary").over(windowSpecAgg)) \
  .withColumn("min", F.min("salary").over(windowSpecAgg)) \
  .withColumn("max", F.max("salary").over(windowSpecAgg)) \
  .withColumn("count", F.count("salary").over(windowSpecAgg)) \
  .withColumn("distinct_count", F.approx_count_distinct("salary").over(windowSpecAgg)) \
  .show()
+--------+----------+------+---+-------+-----+-----+-----+-----+--------------+
|    name|department|salary|row|    avg|  sum|  min|  max|count|distinct_count|
+--------+----------+------+---+-------+-----+-----+-----+-----+--------------+
|  carter|  INCREASE|  8000|  1| 7500.0|15000| 7000| 8000|    2|             2|
| wangzai|  INCREASE|  7000|  2| 7500.0|15000| 7000| 8000|    2|             2|
|    kobe|        IT|  9000|  1| 7900.0|39500| 7000| 9000|    5|             4|
|zhangsan|        IT|  8000|  2| 7900.0|39500| 7000| 9000|    5|             4|
|   qisan|        IT|  8000|  3| 7900.0|39500| 7000| 9000|    5|             4|
|  wangwu|        IT|  7500|  4| 7900.0|39500| 7000| 9000|    5|             4|
|    lisi|        IT|  7000|  5| 7900.0|39500| 7000| 9000|    5|             4|
|   bajiu|      ALGO| 12000|  1|11000.0|33000|10000|12000|    3|             3|
|   james|      ALGO| 11000|  2|11000.0|33000|10000|12000|    3|             3|
| zhaoliu|      ALGO| 10000|  3|11000.0|33000|10000|12000|    3|             3|
+--------+----------+------+---+-------+-----+-----+-----+-----+--------------+
python
from pyspark.sql.window import Window
import pyspark.sql.functions as F
# Note that approx_count_distinct() is the appropriate choice for window-based statistics,
# whereas in a groupBy you would normally use countDistinct() instead to count the distinct
# values within each group. approx_count_distinct() returns an approximate (not exact) value,
# so use it with care.

windowSpec  = Window.partitionBy("department").orderBy(F.desc("salary"))
windowSpecAgg  = Window.partitionBy("department")
 
df.withColumn("row", F.row_number().over(windowSpec)) \
  .withColumn("avg", F.avg("salary").over(windowSpecAgg)) \
  .withColumn("sum", F.sum("salary").over(windowSpecAgg)) \
  .withColumn("min", F.min("salary").over(windowSpecAgg)) \
  .withColumn("max", F.max("salary").over(windowSpecAgg)) \
  .withColumn("count", F.count("salary").over(windowSpecAgg)) \
  .withColumn("distinct_count", F.approx_count_distinct("salary").over(windowSpecAgg)) \
  .where(F.col("row")==1).select("department","avg","sum","min","max","count","distinct_count") \
  .show()

+----------+-------+-----+-----+-----+-----+--------------+
|department|    avg|  sum|  min|  max|count|distinct_count|
+----------+-------+-----+-----+-----+-----+--------------+
|  INCREASE| 7500.0|15000| 7000| 8000|    2|             2|
|        IT| 7900.0|39500| 7000| 9000|    5|             4|
|      ALGO|11000.0|33000|10000|12000|    3|             3|
+----------+-------+-----+-----+-----+-----+--------------+
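
As the comments above point out, this per-department summary can also be written as a plain groupBy aggregation, where countDistinct() gives an exact distinct count instead of the approximate one; a sketch of the equivalent query:

python
# Equivalent per-department summary with groupBy; countDistinct() is exact.
df.groupBy("department").agg(
    F.avg("salary").alias("avg"),
    F.sum("salary").alias("sum"),
    F.min("salary").alias("min"),
    F.max("salary").alias("max"),
    F.count("salary").alias("count"),
    F.countDistinct("salary").alias("distinct_count"),
).show()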
