基于华为云的在线拍卖数据分析

实验设备与平台：

MapReduce服务 MRS

弹性公网IP

弹性云服务器ECS

基于aarch64架构的MySQL，MiniConda，Sqoop

1 数据集准备

删除csv文件中无意义的特征：ReturnsAccepted
通过WinSCP将csv文件传输到华为云服务器本地文件系统的/home/zkpk/raw/目录（需先用下面的命令创建该目录）

# Create the local staging directory together with its parent.
# -p creates missing parents and does not fail when the directory already exists,
# so the step can be re-run safely.
mkdir -p /home/zkpk/raw

通过shell命令去除文件的首行（表头字段名）

# Work from the staging directory; the upload commands that follow use relative file names.
cd /home/zkpk/raw/
# Delete the first line (the header row) of each CSV in place.
# With -i, GNU sed numbers lines per file, so '1d' removes line 1 of every file listed.
sed -i '1d' TrainingSet.csv TestSet.csv

将csv文件上传到HDFS

# Create the target directory on HDFS first; -put fails if /zkpk/raw/ does not exist.
hadoop fs -mkdir -p /zkpk/raw
# Upload both CSV files (relative to the current local directory) in one call.
hadoop fs -put TrainingSet.csv TestSet.csv /zkpk/raw/
# Verify that both files arrived.
hadoop fs -ls /zkpk/raw/

2 数据集预处理

启动hive，并创建数据库zkpk

-- List the databases that exist before the change.
SHOW DATABASES;
-- Create the working database; IF NOT EXISTS keeps the script re-runnable.
CREATE DATABASE IF NOT EXISTS zkpk;
-- Confirm the database is now listed.
SHOW DATABASES;
-- Make zkpk the current database so that the unqualified table names used in
-- later statements resolve to zkpk instead of the "default" database.
USE zkpk;

对训练集创建外部表trainingset_log并导入数据

-- Raw training-set table: one row per auction, parsed from comma-separated text.
-- NOTE(review): ItemAuctionSellPercent, SellerItemAvg and AuctionHitCountAvgRatio are
-- declared INT although their names suggest fractional values; Hive reads a value it
-- cannot parse as the declared type as NULL. Confirm against the CSV before relying on them.
CREATE EXTERNAL TABLE zkpk.trainingset_log(
EbayID STRING,
QuantitySold INT,
Price FLOAT,
PricePercent FLOAT,
StartingBidPercent FLOAT,
SellerName STRING,
SellerClosePercent DOUBLE,
Category INT,
PersonID STRING,
StartingBid FLOAT,
AvgPrice FLOAT,
EndDay STRING,
HitCount INT,
AuctionAvgHitCount INT,
ItemAuctionSellPercent INT,
SellerSaleAvgPriceRatio DOUBLE,
SellerAvg DOUBLE,
SellerItemAvg INT,
AuctionHitCountAvgRatio INT,
BestOffer DOUBLE,
IsHOF INT,
ItemListedCount INT,
AuctionCount INT,
AuctionSaleCount INT,
SellerAuctionCount INT,
SellerAuctionSaleCount INT,
AuctionMedianPrice FLOAT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Load the uploaded CSV into the table. The table name is qualified with the database
-- because the table was created as zkpk.trainingset_log; an unqualified name would be
-- looked up in whatever database is current.
-- LOAD DATA INPATH moves the HDFS file into the table's directory, so
-- /zkpk/raw/TrainingSet.csv no longer exists at its original path afterwards.
LOAD DATA INPATH '/zkpk/raw/TrainingSet.csv' INTO TABLE zkpk.trainingset_log;

对测试集创建外部表testset_log并导入数据

-- Raw test-set table. It has exactly the same columns and row format as
-- zkpk.trainingset_log, so copy that definition instead of repeating the 27 columns;
-- this keeps the two schemas from drifting apart.
CREATE EXTERNAL TABLE zkpk.testset_log LIKE zkpk.trainingset_log;

-- Load the uploaded CSV. The table name is database-qualified to match the CREATE above.
-- LOAD DATA INPATH moves the HDFS file into the table's directory.
LOAD DATA INPATH '/zkpk/raw/TestSet.csv' INTO TABLE zkpk.testset_log;

3 数据集分析处理

统计 TrainingSet 中拍卖成功交易的平均成交价并保存

-- Result table holding a single value: the mean Price over successful auctions.
-- Names are database-qualified so the statements do not depend on the session's
-- current database.
CREATE TABLE zkpk.avg_price (avg_price FLOAT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Rows with QuantitySold = 1 are the auctions that ended in a sale.
INSERT OVERWRITE TABLE zkpk.avg_price
SELECT avg(Price)
FROM zkpk.trainingset_log
WHERE QuantitySold = 1;

统计 TrainingSet 中金牌卖家的拍卖成功率，降序排列并保存

-- Per-seller success rate for sellers flagged IsHOF = 1, highest rate first.
-- The rate is computed and sorted in one statement, so the intermediate
-- success_rate_temp table (created and then dropped in the original) is not needed.
CREATE TABLE zkpk.success_rate_desc (SellerName STRING, Rate DOUBLE)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

-- Rate = sum of QuantitySold / number of non-NULL QuantitySold rows for the seller.
INSERT OVERWRITE TABLE zkpk.success_rate_desc
SELECT
    SellerName,
    sum(QuantitySold) / count(QuantitySold) AS Rate
FROM zkpk.trainingset_log
WHERE IsHOF = 1
GROUP BY SellerName
ORDER BY Rate DESC;

统计TrainingSet和TestSet中周一到周日，每天拍卖成功的数量及拍卖成功率并保存

-- Per-EndDay statistics for the training set: number of successful auctions (Success)
-- and success rate (Rate). Names are database-qualified so the statements do not
-- depend on the session's current database.
CREATE TABLE zkpk.train_day_rate (EndDay STRING, Success INT, Rate DOUBLE)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

INSERT OVERWRITE TABLE zkpk.train_day_rate
SELECT
    EndDay,
    sum(QuantitySold),
    sum(QuantitySold) / count(QuantitySold)
FROM zkpk.trainingset_log
GROUP BY EndDay;

-- Per-EndDay statistics for the test set: number of successful auctions (Success)
-- and success rate (Rate). Names are database-qualified so the statements do not
-- depend on the session's current database.
CREATE TABLE zkpk.test_day_rate (EndDay STRING, Success INT, Rate DOUBLE)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

INSERT OVERWRITE TABLE zkpk.test_day_rate
SELECT
    EndDay,
    sum(QuantitySold),
    sum(QuantitySold) / count(QuantitySold)
FROM zkpk.testset_log
GROUP BY EndDay;

筛选出TrainingSet和TestSet数据中的EbayID，QuantitySold字段，保存为train_label文件和test_label文件

-- Label table for the training set: auction id plus the QuantitySold target.
-- Names are database-qualified so the statements do not depend on the session's
-- current database.
CREATE TABLE zkpk.train_label (EbayID STRING, QuantitySold INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

INSERT OVERWRITE TABLE zkpk.train_label
SELECT
    EbayID,
    QuantitySold
FROM zkpk.trainingset_log;

-- Label table for the test set: auction id plus the QuantitySold target.
-- Names are database-qualified so the statements do not depend on the session's
-- current database.
CREATE TABLE zkpk.test_label (EbayID STRING, QuantitySold INT)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

INSERT OVERWRITE TABLE zkpk.test_label
SELECT
    EbayID,
    QuantitySold
FROM zkpk.testset_log;

从TrainingSet和TestSet数据中删除SellerName、QuantitySold、EndDay字段，并将数据导出保存为train_data文件和test_data文件

-- Feature table for the training set: every column of trainingset_log except
-- SellerName, QuantitySold and EndDay.
-- CREATE TABLE ... AS SELECT takes column names and types from the query, so the
-- 24-column list is written once instead of being repeated in the DDL and the SELECT.
CREATE TABLE zkpk.train_data
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
AS
SELECT
    EbayID,
    Price,
    PricePercent,
    StartingBidPercent,
    SellerClosePercent,
    Category,
    PersonID,
    StartingBid,
    AvgPrice,
    HitCount,
    AuctionAvgHitCount,
    ItemAuctionSellPercent,
    SellerSaleAvgPriceRatio,
    SellerAvg,
    SellerItemAvg,
    AuctionHitCountAvgRatio,
    BestOffer,
    IsHOF,
    ItemListedCount,
    AuctionCount,
    AuctionSaleCount,
    SellerAuctionCount,
    SellerAuctionSaleCount,
    AuctionMedianPrice
FROM zkpk.trainingset_log;

-- Feature table for the test set: every column of testset_log except
-- SellerName, QuantitySold and EndDay.
-- CREATE TABLE ... AS SELECT takes column names and types from the query, so the
-- 24-column list is written once instead of being repeated in the DDL and the SELECT.
CREATE TABLE zkpk.test_data
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
AS
SELECT
    EbayID,
    Price,
    PricePercent,
    StartingBidPercent,
    SellerClosePercent,
    Category,
    PersonID,
    StartingBid,
    AvgPrice,
    HitCount,
    AuctionAvgHitCount,
    ItemAuctionSellPercent,
    SellerSaleAvgPriceRatio,
    SellerAvg,
    SellerItemAvg,
    AuctionHitCountAvgRatio,
    BestOffer,
    IsHOF,
    ItemListedCount,
    AuctionCount,
    AuctionSaleCount,
    SellerAuctionCount,
    SellerAuctionSaleCount,
    AuctionMedianPrice
FROM zkpk.testset_log;

4 数据集导出

安装并配置MySQL环境

# Install wget
yum -y install wget
# Download the MySQL 8.0.26 RPM bundle for aarch64
wget https://mirrors.tuna.tsinghua.edu.cn/mysql/downloads/MySQL-8.0/mysql-8.0.26-1.el8.aarch64.rpm-bundle.tar
# Unpack the bundle and install the packages it contains
tar -xvf mysql-8.0.26-1.el8.aarch64.rpm-bundle.tar
yum -y install *.rpm
# Start the MySQL service and check that it is running.
# The command is systemctl and the unit is mysqld, the same unit enabled below.
systemctl start mysqld
systemctl status mysqld
# Start MySQL automatically at boot
systemctl enable mysqld
systemctl daemon-reload
# Look up the temporary root password generated on first start
grep 'temporary password' /var/log/mysqld.log
# Log in to MySQL as root (enter the temporary password when prompted)
mysql -uroot -p
# At the mysql> prompt, replace the temporary password. The password policy was left
# at its default, so the new password must satisfy it.
# NOTE(review): use your own password here and keep real credentials out of published notes.
ALTER USER 'root'@'localhost' IDENTIFIED BY '20001215,Cj';

设置远程连接用户：通过主机的Navicat for MySQL连接云服务器的MySQL

# Create a user that may connect from any host
create user 'zkpk'@'%' identified by '20001215,Cj';
# Grant privileges to the user
# NOTE(review): ALL on *.* is broader than this tutorial needs; consider limiting it to zkpk.*
grant all on *.* to 'zkpk'@'%';
# Switch the account to the mysql_native_password authentication plugin and disable
# password expiry. The original statement only changed the expiry setting, so the
# authentication method was never actually changed.
ALTER USER 'zkpk'@'%' IDENTIFIED WITH mysql_native_password BY '20001215,Cj' PASSWORD EXPIRE NEVER;
# Reload the privilege tables
flush privileges;

从HDFS导出数据到本地文件系统

# In Hive: show where the table is stored on HDFS (表格名 is a placeholder for the table name)
show create table 表格名;

# In the shell: list the files under the table's storage path
# ("location" is a placeholder for that path)
hadoop fs -ls location
# Copy the table's files from HDFS to the local file system
hadoop fs -get location /home/zkpk/

从HDFS导出数据到MySQL数据库

# In Hive: find the table's storage path on HDFS (表格名 is a placeholder for the table name)
use zkpk;
show create table 表格名;

# In the shell: make sure the local target directory exists, then copy the table's
# files from HDFS ("location" is a placeholder for the storage path found above)
mkdir -p /home/zkpk/result
hadoop fs -get location /home/zkpk/result/
# Log in to MySQL with client-side LOCAL INFILE support enabled
mysql --local_infile=1 -u root -p

# In MySQL: LOAD DATA LOCAL must also be enabled on the server side,
# otherwise the statement below is rejected
SET GLOBAL local_infile = 1;
# Create the target database if it does not exist yet
CREATE DATABASE IF NOT EXISTS zkpk;
# Create the matching table in MySQL before loading (DDL not shown here)
# Load the exported text file into the MySQL table
LOAD DATA LOCAL INFILE '/home/zkpk/result/textfile' INTO TABLE zkpk.表格名 FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';

5 数据可视化展示

安装配置miniconda，并管理依赖包

# Download the Miniconda installer for aarch64
wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py39_4.11.0-Linux-aarch64.sh
# Run the installer with bash: it is a bash script and may fail under a non-bash "sh"
bash Miniconda3-py39_4.11.0-Linux-aarch64.sh
# Reload the shell configuration so the "conda" command set up by the installer is available
source ~/.bashrc

# Enter the conda environment
conda activate
# Install dependencies (依赖项 is a placeholder for the package names)
conda install 依赖项
# Run a Python file (路径+文件名 is a placeholder for path + file name)
python 路径+文件名
# Leave the conda environment
conda deactivate

编写可视化程序

import pymysql
import pandas as pd
import matplotlib.pyplot as plt

# Query to plot. Swap in one of the alternatives to chart a different metric or table:
#   'select EndDay, Rate from train_day_rate'
#   'select EndDay, Success from test_day_rate'
#   'select EndDay, Rate from test_day_rate'
sql = 'select EndDay, Success from train_day_rate'

# Read the per-day statistics from MySQL. try/finally and the cursor context manager
# make sure the cursor and connection are released even if the query fails.
# NOTE(review): credentials are hard-coded; move them to configuration for real use.
db = pymysql.connect(host='localhost', user='root', password='20001215,Cj', database='zkpk')
try:
    with db.cursor() as cursor:
        cursor.execute(sql)
        data = cursor.fetchall()
finally:
    db.close()
print(data)

# One row per EndDay value: the day label and the value to plot.
df = pd.DataFrame(list(data), columns=['endDay', 'amount'])

# plt.figure must be called; the original referenced it without parentheses, a no-op.
plt.figure()
plt.xlabel('day')
plt.ylabel('amount')
plt.bar(df['endDay'], df['amount'])
plt.show()

华为云DLV组件实现可视化展示

训练集拍卖成功数量与比例柱状图

测试集拍卖成功数量与比例柱状图

6 拍卖成功率预测

从数据库获取数据集train_label，train_data，test_label，test_data

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pymysql

# Columns of the train_data / test_data tables, in the order they are selected.
_FEATURE_COLUMNS = [
    'EbayID', 'Price', 'PricePercent', 'StartingBidPercent', 'SellerClosePercent',
    'Category', 'PersonID', 'StartingBid', 'AvgPrice', 'HitCount',
    'AuctionAvgHitCount', 'ItemAuctionSellPercent', 'SellerSaleAvgPriceRatio',
    'SellerAvg', 'SellerItemAvg', 'AuctionHitCountAvgRatio', 'BestOffer', 'IsHOF',
    'ItemListedCount', 'AuctionCount', 'AuctionSaleCount', 'SellerAuctionCount',
    'SellerAuctionSaleCount', 'AuctionMedianPrice',
]
_LABEL_COLUMNS = ['QuantitySold']


def _read_table(cursor, table, columns):
    """Read the given columns of a table into a DataFrame, ordered by EbayID.

    Rows are ordered by EbayID because SQL gives no ordering guarantee without
    ORDER BY; feature and label tables are read separately, and a shared order is
    what keeps row i of the features aligned with row i of the labels.
    `table` and `columns` are fixed names from this module, never user input.
    """
    cursor.execute('select {} from {} order by EbayID'.format(', '.join(columns), table))
    return pd.DataFrame(list(cursor.fetchall()), columns=columns)


def getdataset():
    """Load the training and test sets from the MySQL database zkpk.

    Returns:
        (train_set, train_data, train_label, test_data, test_label), where
        train_set is the training features including EbayID, train_data and
        test_data are the features without EbayID, and train_label and
        test_label are single-column DataFrames holding QuantitySold.
    """
    # Connect to the database. For a local MySQL instance use instead:
    # db = pymysql.connect(host='localhost', user='root', password='20001215,Cj', database='zkpk')
    # NOTE(review): host and credentials are hard-coded; move them to configuration for real use.
    db = pymysql.connect(host='124.70.59.198', user='zkpk', password='20001215,Cj', database='zkpk')
    try:
        with db.cursor() as cursor:
            # Training features
            train_set = _read_table(cursor, 'train_data', _FEATURE_COLUMNS)
            print("the shape of train_set:", train_set.shape)
            # Training labels
            train_label = _read_table(cursor, 'train_label', _LABEL_COLUMNS)
            print("the shape of train_label:", train_label.shape)
            # Test features
            test_set = _read_table(cursor, 'test_data', _FEATURE_COLUMNS)
            # Test labels
            test_label = _read_table(cursor, 'test_label', _LABEL_COLUMNS)
    finally:
        # Always release the connection, even if a query fails.
        db.close()

    # EbayID is an identifier, not a predictive feature; drop it from the model inputs.
    train_data = train_set.drop(['EbayID'], axis=1)
    test_data = test_set.drop(['EbayID'], axis=1)
    return train_set, train_data, train_label, test_data, test_label

拍卖成功预测

随机小批量梯度下降法：minibatchSGDClassification.py

import pandas as pd
import matplotlib.pyplot as plt
import pymysql
import numpy as np
from GetDataSet import getdataset
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score

train_set, train_data, train_label, test_data, test_label = getdataset()

# 1-D label arrays; scikit-learn estimators and metrics expect shape (n_samples,).
y_train = train_label['QuantitySold'].to_numpy()
y_test = test_label['QuantitySold'].to_numpy()


def plot_learning(clf, title):
    """Train clf on the training set in mini-batches and plot its learning curves.

    Reads the module-level train_set, train_data (already standardized when this is
    called) and y_train. For every batch after the first, the score on the batch
    before fitting on it is recorded as the validation score and the score after
    fitting on it as the training score. The figure is saved as 'test.jpg'.

    Args:
        clf: an estimator supporting partial_fit and score (here SGDClassifier).
        title: title of the plot.
    """
    plt.figure()
    # Score on each batch before the model has been trained on it.
    validationScore = []
    # Score on each batch right after the model has been trained on it.
    trainScore = []
    # Number of samples per mini-batch.
    mini_batch = 1000
    n_items = train_set.shape[0]
    # partial_fit needs the complete label set up front. Take it from the data
    # instead of hard-coding range(5) for what is a binary task.
    classes = np.unique(y_train)
    for i in range(int(np.ceil(n_items / mini_batch))):
        start = i * mini_batch
        stop = min((i + 1) * mini_batch, n_items)
        x_batch = train_data[start:stop]
        y_batch = y_train[start:stop]
        # The untrained model cannot be scored, so batch 0 is skipped for both curves.
        if i > 0:
            validationScore.append(clf.score(x_batch, y_batch))
        clf.partial_fit(x_batch, y_batch, classes=classes)
        if i > 0:
            trainScore.append(clf.score(x_batch, y_batch))
    plt.plot(trainScore, label="train_score")
    plt.plot(validationScore, label="validation_score")
    plt.xlabel("Mini_batch")
    plt.ylabel("Score")
    # Without legend() the labels passed to plot() are never displayed.
    plt.legend()
    plt.grid()
    plt.title(title)
    plt.savefig('test.jpg')


# Standardize the training features (zero mean, unit variance per column).
scaler = StandardScaler()
train_data = scaler.fit_transform(train_set.drop(['EbayID'], axis=1))

# Binary classification with SGD.
clf = SGDClassifier(penalty='l2', alpha=0.0004)
plot_learning(clf, 'SGDClassifier')

# Scale the test features with the statistics learned from the training set.
# Re-fitting the scaler on the test set would scale the two sets differently.
test_data = scaler.transform(test_data)
train_pred = clf.predict(train_data)
test_pred = clf.predict(test_data)

# All three metrics are computed on the test set, as the heading states. The original
# mixed test-set precision (micro-averaged) with training-set recall and F1.
print("SGDClassifier training performance on testing dataset:")
print("\tPrecision：%1.3f" % precision_score(y_test, test_pred))
print("\tRecall：%1.3f" % recall_score(y_test, test_pred))
print("\tF1：%1.3f \n" % f1_score(y_test, test_pred))

决策树：ExecutiveTree

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pymysql
from sklearn import tree
from GetDataSet import getdataset

# Load features and labels from the database.
train_set, train_data, train_label, test_data, test_label = getdataset()

# Fit a decision tree with default settings on the training features.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_data, train_label)

# score() returns the mean accuracy on the given data, so label it as accuracy;
# the original printed it as "Precision".
print("Accuracy: ", clf.score(test_data, test_label))

逻辑回归：LogisticRegression.py

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from GetDataSet import getdataset

# Load the data set.
train_set, train_data, train_label, test_data, test_label = getdataset()

# Standardize the features: fit the scaler on the training set only, then apply the
# same transformation to the test set. The original left test_data unscaled, so the
# model was evaluated on inputs in a different scale from the one it was trained on.
std = StandardScaler()
train_data = std.fit_transform(train_data)
test_data = std.transform(test_data)

# Binary classification with logistic regression.
# ravel() turns the single-column label frame into the 1-D array fit() expects.
lg = LR(C=1.0)
lg.fit(train_data, train_label.values.ravel())

test_predict = lg.predict(test_data)
print("准确率: ", lg.score(test_data, test_label.values.ravel()))

基于华为云的在线拍卖数据分析相关推荐

光大银行刘淼：基于华为云GaussDB(DWS) 数据仓库创新实践
摘要:面向未来数据平台3.0要做架构减法,平台由N->1,华为云GaussDB(DWS)未来作为数据仓库唯一平台,数据链路实现从数据湖直接到华为云GaussDB(DWS)数据仓库. 日前,华为举 ...
基于华为云ECS的目标检测与识别的昇腾AI开发体验【华为云至简致远】
[摘要] 基于华为云弹性云服务器ECS,搭配共享镜像,快速搭建环境,进行目标检测与识别的昇腾AI开发体验,开箱即用,打破时间和空间限制,随时随地进行开发,适合个人开发和团队协作,体验流畅丝滑. 前言 ...
永洪BI携手华为云FusionInsight，让数据分析提效20倍
9月23日,华为全联接2021正式开幕,以"深耕数字化"为主题,各行业领军人物分享最新成果与实践.其中在"华为云Stack构筑繁荣行业生态,让伙伴用好云"分论坛 ...
【华为云技术分享】基于华为云IOT及无线RFID技术的智慧仓储解决方案最佳实践系列一
摘要:仓储管理存在四大细分场景:出入库管理.盘点.分拣和货物跟踪.本系列将介绍利用华为云IOT全栈云服务,端侧采用华为收发分离式RFID解决方案,打造端到端到IOT智慧仓储解决方案的最佳实践. 仓储是 ...
基于华为云ModelArts平台利用MobileNetV2算法实现5类花卉分类
*************************************************** 码字不易,收藏之余,别忘了给我点个赞吧! *************************** ...
往期直播：《驴妈妈，基于混合云的OTA行业数据分析、精准运营和大数据用户推荐》...
最新活动报名: 3月16日直播<空格技术架构云上实践与经验>:https://yq.aliyun.com/webinar/join/4 3月18日直播<爆款App小咖秀产品服务端架构 ...
基于华为云的Django网站部署
学习笔记,仅供参考,有错必纠参考自:Xshell远程连接服务器:win10 开启ssh server服务远程登录文章目录基于华为云的Django网站部署远程登录华为云 Xftp+Xshell ...
基于华为云对话机器人技能平台的规则模板概述
华为云对话机器人服务中的技能平台,向机器人服务提供了一系列即插即用的技能,可为对话机器人增加了用户配置机器人的灵活度. 在邀测的过程中,我们总结了在技能平台上使用的规则配置上的一些功能和想法,与大家分 ...
基于华为云的一个典型的持续部署方案
云社区博客博客详情基于华为云的一个典型的持续部署方案 [摘要] 华为云迄今为止已经有14大类超过100种服务了,可以做很多有用和好玩的方案. 基于华为云的一个典型的持续部署方案:Function ...

基于华为云的在线拍卖数据分析

1 数据集准备

2 数据集预处理

3 数据集分析处理

4 数据集导出

5 数据可视化展示

6 拍卖成功率预测

基于华为云的在线拍卖数据分析相关推荐

最新文章

热门文章