PySpark 初试

1、安装jdk

2、安装spark

复制代码
# Download Spark 4.0.0 (Hadoop 3 build) from the Tsinghua mirror and unpack it
# into the conventional /opt/spark location.
curl -o spark.tgz https://mirrors.tuna.tsinghua.edu.cn/apache/spark/spark-4.0.0/spark-4.0.0-bin-hadoop3.tgz
tar -xvf spark.tgz
mv spark-4.0.0-bin-hadoop3 /opt/spark

# NOTE: for `source /etc/profile` below to have any effect, these two export
# lines should be appended to /etc/profile rather than run only in the
# current shell.
export SPARK_HOME=/opt/spark

# FIX: the variables must be dereferenced with `$`. The original
# `PATH=PATH:SPARK_HOME/bin:...` sets PATH to a literal string, so
# `spark-shell` would never be found on the PATH.
export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

source /etc/profile

# Verify the installation by launching the interactive Scala shell.
spark-shell

复制代码
# Optional: if pyspark is not pip-installed, findspark can locate a local
# Spark installation before the import below:
# import findspark
# findspark.init()
from pyspark.sql import SparkSession

# One SparkSession per application; getOrCreate() reuses an existing session
# if one is already running (e.g. inside a notebook).
spark = SparkSession.builder.appName('test').getOrCreate()

# df = spark.read.text("name.txt")
# df.show(2)

# Read a CSV that has a header row. inferSchema=True makes Spark scan the
# data to guess column types — convenient for demos, slower than supplying
# an explicit schema on large files.
df = spark.read.csv("911.csv", header=True, inferSchema=True)

df.show(5)            # print the first 5 rows as a formatted table
df.head(5)            # return the first 5 rows as a list of Row objects
df.printSchema()      # print the inferred column names and types
df.count()            # total number of rows (triggers a full scan)
df.describe().show()  # summary statistics for numeric columns

# FIX: the keyword was misspelled `frction`, which raises a TypeError at
# runtime; the correct parameter name is `fraction`.
df.sample(fraction=0.05).show()  # ~5% random sample of the rows

row = df.head()  # fetch only the first row
row.asDict()     # convert a Row into a plain dict
df.columns       # list of column names (metadata only — no Spark job runs)

# Projection: select a column together with a derived, aliased column.
df.select(df['salary'], ((df['salary'] * 0.1).alias('bonus'))).show()