multiply

from pyspark import SparkConf, SparkContext
sc = SparkContext()

# Product of all elements: fold combines the values pairwise, starting from
# the multiplicative identity 1.
nums = sc.parallelize([1, 2, 3, 4, 5])
mult = nums.fold(1, lambda acc, value: acc * value)
print(mult)

# Sum of all elements: the same fold, starting from the additive identity 0.
accumulate = nums.fold(0, lambda acc, value: acc + value)
print(accumulate)

sort_by key

! cat ./data.txt

crazy crazy fox jumped
crazy for jumped
fox is fast
fox is smart
dog is smart

# Load the text file as an RDD of lines, asking for a single partition.
lines = sc.textFile('data.txt', minPartitions=1)
lines.collect()

['crazy crazy fox jumped','crazy for jumped','fox is fast ','fox is smart','dog is smart']

# Word count.
# flatMap flattens the per-line token lists into a single RDD of words.
words = lines.flatMap(lambda line: line.split(' '))
# Pair every word with 1, then add up the ones belonging to the same word.
pairs = words.map(lambda word: (word, 1))
frequencies = pairs.reduceByKey(lambda left, right: left + right)
frequencies.collect()

[('crazy', 3),('fox', 3),('jumped', 2),('for', 1),('is', 3),('fast', 1),('', 1),('smart', 2),('dog', 1)]

# Number of (word, count) pairs in the frequency RDD.
frequencies.count()

# Intermediate step: split every line on single spaces and flatten the tokens
# into one list.
lines.flatMap(lambda x : x.split(' ')).collect()

['crazy','crazy','fox','jumped','crazy','for','jumped','fox','is','fast','','fox','is','smart','dog','is','smart']

# Intermediate step: pair every token with an initial count of 1.
lines.flatMap(lambda x : x.split(' ')).map(lambda x : (x, 1)).collect()

[('crazy', 1),('crazy', 1),('fox', 1),('jumped', 1),('crazy', 1),('for', 1),('jumped', 1),('fox', 1),('is', 1),('fast', 1),('', 1),('fox', 1),('is', 1),('smart', 1),('dog', 1),('is', 1),('smart', 1)]

# Show the raw lines held by the RDD.
lines.collect()

['crazy crazy fox jumped','crazy for jumped','fox is fast ','fox is smart','dog is smart']

# Full pipeline in one expression: tokenise, pair each token with 1, then sum
# the ones per token.
lines.flatMap(lambda x : x.split(' ')).map(lambda x : (x, 1)).reduceByKey(lambda x, y : x + y).collect()

[('crazy', 3),('fox', 3),('jumped', 2),('for', 1),('is', 3),('fast', 1),('', 1),('smart', 2),('dog', 1)]

sum

# Sum the integers 1..8 with fold, using 0 as the starting value.
nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8])
Sum = nums.fold(0, lambda x, y: x + y)
print(Sum)

union

# Two pair RDDs that share the keys 'k1' and 'k2'.
r1 = sc.parallelize([('k1', 1), ('k2', 2), ('k3', 3)])
r2 = sc.parallelize([('k1', 3), ('k2', 4), ('k4', 8)])

# union concatenates the two RDDs; duplicate keys are kept as separate pairs.
r3 = r1.union(r2)
print('r3 :', r3.collect())

# reduceByKey merges the values of each key, here by adding them.
r4 = r3.reduceByKey(lambda x, y: x + y)
print('r4 :', r4.collect())

r3 : [('k1', 1), ('k2', 2), ('k3', 3), ('k1', 3), ('k2', 4), ('k4', 8)]
r4 : [('k1', 4), ('k3', 3), ('k4', 8), ('k2', 6)]

Word frequency

!cat './data.txt'

crazy crazy fox jumped over the fence
crazy fox jumped
the fence is high of fox
crazy fox is smart
fox jumped very high

# Load the text file as an RDD of lines and print its contents.
lines2 = sc.textFile('./data.txt')
loaded_lines = lines2.collect()
print(loaded_lines)

['crazy crazy fox jumped over the fence', 'crazy fox jumped', 'the fence is high of fox', 'crazy fox is smart', 'fox jumped very high']

# Split each line of lines2 on single spaces, giving an RDD with one list of
# words per line. This maps over lines2 itself rather than the `lines` RDD
# from the earlier word-count section.
lines2 = lines2.map(lambda x: x.split(' '))
print('lines2 is :')
print(lines2.collect())

lines2 is :
[['crazy', 'crazy', 'fox', 'jumped', 'over', 'the', 'fence'], ['crazy', 'fox', 'jumped'], ['the', 'fence', 'is', 'high', 'of', 'fox']]

# Concatenate the per-line word lists into one flat Python list; fold is an
# action, so its result is a plain list rather than an RDD.
bigrams_list = lines2.fold([], lambda x, y: x + y)
# Turn the flat list back into an RDD so it can be processed with Spark again.
bigrams_list = sc.parallelize(bigrams_list)
print('bigrams list :')
print(bigrams_list.collect())

bigrams list :
['crazy', 'crazy', 'fox', 'jumped', 'over', 'the', 'fence', 'crazy', 'fox', 'jumped', 'the', 'fence', 'is', 'high', 'of', 'fox']

# Count how many times each word occurs.
word_counts = bigrams_list.map(lambda x: (x, 1)).reduceByKey(lambda x, y: x + y)
# NOTE(review): count() on word_counts is the number of distinct words, not
# the total number of tokens, so the ratios below do not sum to 1 — confirm
# that this denominator is intended.
n_words = word_counts.count()
# Divide each word's count by n_words to get its relative frequency.
word_frequency = word_counts.map(lambda x: (x[0], float(x[1] / n_words)))
print('word frequency')
print(word_frequency.collect())

word frequency
[('crazy', 0.3333333333333333), ('of', 0.1111111111111111), ('jumped', 0.2222222222222222), ('high', 0.1111111111111111), ('fence', 0.2222222222222222), ('fox', 0.3333333333333333), ('over', 0.1111111111111111), ('is', 0.1111111111111111), ('the', 0.2222222222222222)]

pyspark:basic_operating_1相关推荐

pyspark汇总小结
20220402 Spark报Total size of serialized results of 12189 tasks is bigger than spark.driver.maxResult ...
spark- PySparkSQL之PySpark解析Json集合数据
PySparkSQL之PySpark解析Json集合数据数据样本 12341234123412342|asefr-3423|[{"name":"spark", ...
jupyter笔记本_如何为Jupyter笔记本电脑设置PySpark
jupyter笔记本 by Tirthajyoti Sarkar 由Tirthajyoti Sarkar 如何为Jupyter笔记本电脑设置PySpark (How to set up PySpark ...
手把手教你实现PySpark机器学习项目——回归算法
作者 | hecongqing 来源 | AI算法之心(ID:AIHeartForYou) [导读]PySpark作为工业界常用于处理大数据以及分布式计算的工具,特别是在算法建模时起到了非常大的作用. ...
pyspark常用API
union 和unionall union 纵向合并dataframe In this Spark article, you will learn how to union two or more d ...
利用PySpark进行迁移学习的多类图像分类
在本文中,我们将演示计算机视觉问题,它具有结合两种最先进技术的能力:深度学习和Apache Spark.我们将利用深度学习管道的强大功能来解决多类图像分类问题. PySpark 是 Spark 为 P ...
pyspark dataframe数据连接（join）、转化为pandas dataframe、基于多个字段删除冗余数据
pyspark dataframe数据连接(join).转化为pandas dataframe.基于多个字段删除冗余数据目录 pyspark dataframe数据连接(join).转化为panda ...
基于关联规则（Variational Autoencoders）疾病预测系统实战：(pyspark FPGrowth实现频繁项集挖掘、最后给出预测模型topK准确率和召回率)
基于关联规则(Variational Autoencoders)疾病预测系统实战:(pyspark FPGrowth实现频繁项集挖掘.最后给出预测模型topK准确率和召回率) 目录
pyspark sparksession_pyspark中的行列互转
行列互转在数据分析与挖掘中是经常遇到的问题,这篇文章总结了pyspark中行列互转的方法,首先我们先创建测试数据集. from 列转行 pivot 实现透视操作简单直接,逻辑如下按照不需要转换的字 ...
独家 | 一文读懂PySpark数据框（附实例）
作者:Kislay Keshari 翻译:季洋校对:倪骁然本文约1900字,建议阅读8分钟. 本文中我们将探讨数据框的概念,以及它们如何与PySpark一起帮助数据分析员来解读大数据集. 数据框是 ...

pyspark:basic_operating_1

multiply

sort_by key

sum

union

Word frequency

pyspark:basic_operating_1相关推荐

最新文章

热门文章