pyspark写法总结

往hive表中插入数据

法1：dataFrame数据写入hive表

def log2Hive():
    """Append one log record to the Hive table ``app.app_tion_log``.

    Builds a single-row DataFrame from the names ``dt``, ``types``, ``msg``
    and ``currTime`` and inserts it through ``hiveContext``. None of these
    names is defined in this snippet, so they must already exist in the
    enclosing scope when the function is called.
    """
    # types: "INFO" or "ERROR"
    record = {"dt": dt, "types": types, "message": msg, "currtime": currTime}
    # coalesce(1) reduces the one-row frame to a single partition.
    log = hiveContext.createDataFrame([record]).coalesce(1)
    log.write.mode("append").insertInto("app.app_tion_log")

法2：dataFrame数据写入hive表

# Join the two frames on po_no and reduce the result to 10 partitions.
temp_vl_org = self.vl_data.join(self.order_slice, ["po_no"]).coalesce(10)

# Remove the existing files of the target table (bypassing the trash) and drop
# its definition before writing it again.
os.system("hadoop fs -rm -r -skipTrash dev.db/" + temp_vl.lower())
hiveContext.sql("drop table if exists dev." + temp_vl)

# Save the frame built above; the original wrote `temp_vlt_org`, a name that is
# never defined in this snippet.
temp_vl_org.write.saveAsTable("dev." + temp_vl)

hiveContext写法

# Read a Hive table, keep the rows of one dt partition, and reduce it to the
# distinct (po_no, goods_no, dc_id) combinations.
# NOTE(review): the table name contains a space exactly as in the original
# notes; confirm the real table name before running.
order_slice = (
    hiveContext.table("app.app_ rage")
    .coalesce(10)
    .where(col("dt") == self.order_new_dt)
    .select("po_no", "goods_no", col("distribution_no").alias("dc_id"))
    .distinct()
    .coalesce(10)
)

# Build a one-row DataFrame from a list of dicts; types: "INFO" or "ERROR".
log = hiveContext.createDataFrame(
    [{"dt": dt, "types": types, "message": msg, "currtime": currTime}]
).coalesce(1)

# Largest value of the "partition" column returned by SHOW PARTITIONS, as a
# string. The keywords need a space between them ("show partitions").
(
    hiveContext.sql("show partitions app.app_ rage")
    .select(func.max("partition").alias("partition"))
    .rdd.map(lambda x: str(x[0]))
    .take(1)[0]
)

4.dataFrame数据写入hive表

# Join the two frames on po_no and reduce the result to 10 partitions.
temp_vl_org = self.vl_data.join(self.order_slice, ["po_no"]).coalesce(10)

# Remove the existing files of the target table (bypassing the trash) and drop
# its definition before writing it again.
os.system("hadoop fs -rm -r -skipTrash dev.db/" + temp_vl.lower())
hiveContext.sql("drop table if exists dev." + temp_vl)

# Save the frame built above; the original wrote `temp_vlt_org`, a name that is
# never defined in this snippet.
temp_vl_org.write.saveAsTable("dev." + temp_vl)

python 中执行hadoop命令

# Delete the table's files from HDFS, bypassing the trash.
# "-skipTrash" and the path are separate arguments, so a space is required
# between them.
os.system("hadoop fs -rm -r -skipTrash dev.db/" + temp_vl.lower())

# Copy one dt partition from data mart 1 (localFolder) into the Hive table
# partition directory of data mart 2 (tbFolder).
# The stray space inside the original localFolder path would have split it into
# two shell arguments; it is removed to match tbFolder's layout.
localFolder = "app.db/app_filter/"
partitonName = "dt=" + self.table_new_dt
tbFolder = "hdfs://102.1.1.1:8080/user/cm_pc/app.db/app_filter/"

# Remove the destination partition first. The trailing space after
# "-skipTrash" keeps the flag separate from the path.
os.system("hadoop fs -rm -r -skipTrash " + tbFolder + partitonName)

# distcp <source> <destination>:
# localFolder + partitonName is the source path,
# tbFolder + partitonName is the destination path.
os.system("hadoop distcp " + localFolder + partitonName + " " + tbFolder + partitonName)

dataFrame和rdd互转,存入Hive表

# hc.sql(...) returns a DataFrame.
df = hc.sql(sql).coalesce(5)

# DataFrame -> RDD: key each row by its first two columns, group the remaining
# four columns per key, then hand every (key, values) pair to sub_process.
# The pair is indexed instead of unpacked in the lambda signature because
# tuple parameters are not valid syntax on Python 3.
r2 = (
    df.rdd
    .map(lambda row: ((row[0], row[1]), (row[2], row[3], row[4], row[5])))
    .groupByKey()
    .map(lambda kv: sub_process(kv[0], kv[1]))
)

cond = ["rowkey", "top_10", "dt"]

# RDD -> DataFrame, with cond passed as the schema argument.
result = r2.toDF(cond)

# Append the DataFrame to the Hive table.
result.write.mode("append").insertInto(table_name)

创建dataFrame方法

法1：

# Convert the RDD to a DataFrame, passing cond as the schema argument.

result=r2.toDF(cond)

法2：

# Build a DataFrame from a Hive table: keep the rows whose dt equals
# self.ord_new_dt, project po_no, goods_no and distribution_no (renamed to
# dc_id), and drop duplicate rows.
order_slice = (
    hiveContext.table("app.apprage")
    .coalesce(10)
    .where(col("dt") == self.ord_new_dt)
    .select("po_no", "goods_no", col("distribution_no").alias("dc_id"))
    .distinct()
    .coalesce(10)
)

法3：

# Build a one-row DataFrame from a list holding a single dict, reduced to one
# partition. types: "INFO" or "ERROR".
log = hiveContext.createDataFrame(
    [{"dt": dt, "types": types, "message": msg, "currtime": currTime}]
).coalesce(1)

法4：

# The result of a SQL statement is itself a DataFrame. The keywords need a
# space between them ("show partitions").
hiveContext.sql("show partitions app.apprage")

法5：

从hdfs中读取数据变成dataFrame

input_path为hdfs路径

# Read the text files under input_path and split every line on the \x01
# delimiter ("lambda x", with the space, is required for valid syntax).
textFile = sc.textFile(input_path).map(lambda x: x.split("\x01"))

# Name the columns, sum real_outstore_qty per group, and keep the groups whose
# sp_create_time lies strictly between "2015-01-01" and endDate.
df = (
    textFile.toDF([
        "seller_no", "seller_name", "dept_no", "goods_no", "shop_id",
        "sp_goods_no", "dc_id", "erp_warehouse_no", "real_outstore_qty",
        "sp_create_time",
    ])
    .groupby([
        "seller_no", "seller_name", "dept_no", "goods_no", "dc_id",
        "erp_warehouse_no", "sp_create_time",
    ])
    .agg(func.sum("real_outstore_qty").alias("real_outstore_qty"))
    .where((col("sp_create_time") < endDate) & (col("sp_create_time") > "2015-01-01"))
)

pyspark when

# Select and type-cast columns from one dt partition, excluding two so_status
# values, and substitute defaults for NULLs with when(...).otherwise(...).
# The chain is wrapped in parentheses instead of using a backslash
# continuation, "!=" replaces the Python-2-only "<>" operator, and when() is
# taken from `func`, the same alias this expression already uses for isnull().
dataDf = (
    hiveContext.table(self.table_name)
    .where(
        (col("dt") == self.table_new_dt)
        & (col("so_status") != "10028")
        & (col("so_status") != "10009")
    )
    .select(
        "seller_no",
        "seller_name",
        "dept_no",
        "goods_no",
        col("shop_id").cast("int"),
        # NULL sp_goods_no becomes -1, otherwise the value cast to int.
        func.when(func.isnull("sp_goods_no"), -1)
        .otherwise(col("sp_goods_no").cast("int"))
        .alias("sp_goods_no"),
        col("distribution_no").cast("int").alias("dc_id"),
        col("erp_warehouse_no").cast("int"),
        col("sp_create_time").cast("date"),
        # NULL apply_out_qty becomes 1, otherwise the value is kept as is.
        func.when(func.isnull("apply_out_qty"), 1)
        .otherwise(col("apply_out_qty"))
        .alias("apply_qty"),
    )
    .coalesce(10)
)

dataFrame添加列

法1,2

# Group dataDf and add columns in two ways: aggregated columns via agg(), and
# a constant column via withColumn(..., lit(...)).
# Parentheses replace the backslash continuations, which cannot be followed by
# a blank line or a comment line.
self.groupedDf = (
    dataDf.groupBy(
        "seller_no", "dept_no", "goods_no", "shop_id",
        "sp_goods_no", "dc_id", "erp_warehouse_no", "sp_create_time",
    )
    # Add two aggregated columns.
    .agg(
        func.max("seller_name").alias("seller_name"),
        func.sum("apply_outstore_qty").alias("apply_outstore_qty"),
    )
    # Add a dt column holding the same value on every row.
    .withColumn("dt", lit(self.table_new_dt))
)

判断是否执行成功

if __name__ == "__main__":
    main()

    # Check for the _SUCCESS marker file. "-test" and "-e" are separate
    # arguments of `hadoop fs`; os.system returns the command's exit status,
    # and a non-zero status is treated as failure.
    r = os.system("hadoop fs -test -e /tmp/for/result/e_cast/_SUCCESS")
    if r != 0:
        raise Exception("1")
    print("执行spark获取eclp预测数据成功！！")

从hdfs导入数据到hive

def push2hive(self):
    """Load the files under an HDFS directory into one Hive table partition.

    Runs a ``LOAD DATA INPATH ... OVERWRITE INTO TABLE ... PARTITION`` statement
    through ``hiveContext``; the partition value is the name ``_today``, which
    must be defined in the enclosing scope.
    """
    # HiveQL spells the keyword as one word, INPATH.
    # NOTE(review): the table name contains a space exactly as in the original
    # notes; confirm the real table name before running.
    hiveContext.sql(
        "LOAD DATA INPATH '/tmp/for/result/ec_forecast' "
        "OVERWRITE INTO TABLE app.app_ ver1 PARTITION( dt = '" + _today + "')"
    )

读取全路径下的Hive表

# Read a Hive table partition directly from its full HDFS path.
# NOTE(review): the path contains a space exactly as in the original notes;
# confirm the real directory name before running.
hdfs_path_ver1 = "/user/cmo_ipc/app.db/app_ ver1/dt=" + _today

# Split each line on tabs, key it by its first four fields with the ninth field
# as the value, group by key, and pass each (key, values) pair to
# sales_nation_ver1. "lambda line" needs the space, and the pair is indexed
# rather than unpacked in the lambda signature so the code is valid on Python 3.
df1 = (
    sc.textFile(hdfs_path_ver1)
    .map(lambda x: x.split("\t"))
    .map(lambda line: ((line[0], line[1], line[2], line[3]), line[8]))
    .groupByKey()
    .map(lambda kv: sales_nation_ver1(kv[0], kv[1]))
)