技术标签: Spark学习之路 tensorflow python spark
# import sys
# sys.path.append(r'/home/ZSX/anaconda3/envs/tensorflow/lib/python3.8/site-packages/')
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
读者需要按自己的文件路径更换取值
raw_data = pd.read_csv("./bitstampUSD.csv")
查看原始数据
raw_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4572257 entries, 0 to 4572256
Data columns (total 8 columns):
# Column Dtype
--- ------ -----
0 Timestamp int64
1 Open float64
2 High float64
3 Low float64
4 Close float64
5 Volume_(BTC) float64
6 Volume_(Currency) float64
7 Weighted_Price float64
dtypes: float64(7), int64(1)
memory usage: 279.1 MB
现在的数据一共有4572257条,数据由Timestamp、Open、High、Low、Close、Volume_(BTC)、Volume_(Currency)、Weighted_Price这几列组成。其中除去Timestamp列(int64)以外,其余的数据列都是float64数据类型。
现在查看前10行数据
raw_data.head(10)
| Timestamp | Open | High | Low | Close | Volume_(BTC) | Volume_(Currency) | Weighted_Price | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1325317920 | 4.39 | 4.39 | 4.39 | 4.39 | 0.455581 | 2.0 | 4.39 |
| 1 | 1325317980 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 1325318040 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 1325318100 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 1325318160 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | 1325318220 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 6 | 1325318280 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 7 | 1325318340 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 8 | 1325318400 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 9 | 1325318460 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# 删除包含NaN值的任何行
data = raw_data.dropna(axis = 0)
data.head(10)
| Timestamp | Open | High | Low | Close | Volume_(BTC) | Volume_(Currency) | Weighted_Price | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1325317920 | 4.39 | 4.39 | 4.39 | 4.39 | 0.455581 | 2.000000 | 4.390000 |
| 478 | 1325346600 | 4.39 | 4.39 | 4.39 | 4.39 | 48.000000 | 210.720000 | 4.390000 |
| 547 | 1325350740 | 4.50 | 4.57 | 4.50 | 4.57 | 37.862297 | 171.380338 | 4.526411 |
| 548 | 1325350800 | 4.58 | 4.58 | 4.58 | 4.58 | 9.000000 | 41.220000 | 4.580000 |
| 1224 | 1325391360 | 4.58 | 4.58 | 4.58 | 4.58 | 1.502000 | 6.879160 | 4.580000 |
| 1896 | 1325431680 | 4.84 | 4.84 | 4.84 | 4.84 | 10.000000 | 48.400000 | 4.840000 |
| 2333 | 1325457900 | 5.00 | 5.00 | 5.00 | 5.00 | 10.100000 | 50.500000 | 5.000000 |
| 3612 | 1325534640 | 5.00 | 5.00 | 5.00 | 5.00 | 19.048000 | 95.240000 | 5.000000 |
| 4553 | 1325591100 | 5.32 | 5.32 | 5.32 | 5.32 | 2.419173 | 12.870000 | 5.320000 |
| 4710 | 1325600520 | 5.14 | 5.14 | 5.14 | 5.14 | 0.680000 | 3.495200 | 5.140000 |
先查看下数据是否含有nan的数据,可以看到我们的数据中没有nan的数据
data.isnull().sum()
Timestamp 0
Open 0
High 0
Low 0
Close 0
Volume_(BTC) 0
Volume_(Currency) 0
Weighted_Price 0
dtype: int64
可以看出现在已经没有NaN的数据了
再查看下0数据,可以看到我们的数据中含有0值,我们需要对0值做下处理
(data == 0).astype(int).any()
Timestamp False
Open False
High False
Low False
Close False
Volume_(BTC) True
Volume_(Currency) True
Weighted_Price False
dtype: bool
处理0数据的方式是:先把0替换为NaN,再用同一列中的上一个有效值进行前向填充(forward fill)
# Treat zeros in the price/volume columns as missing values and forward-fill
# each of them with the previous non-zero value of the same column.
#
# The update is assigned back to the DataFrame instead of calling
# `data[col].replace(..., inplace=True)`: that chained form works on a
# temporary column object and is not guaranteed to write back into `data`
# (it is a silent no-op under pandas Copy-on-Write). `fillna(method=...)` is
# deprecated as well; `ffill()` is the supported spelling.
zero_filled_columns = [
    'Weighted_Price',
    'Open',
    'High',
    'Low',
    'Close',
    'Volume_(BTC)',
    'Volume_(Currency)',
]
data = data.copy()  # `data` is the result of dropna(); own it before writing
data[zero_filled_columns] = (
    data[zero_filled_columns].replace(0, np.nan).ffill()
)
(data == 0).astype(int).any()
Timestamp False
Open False
High False
Low False
Close False
Volume_(BTC) False
Volume_(Currency) False
Weighted_Price False
dtype: bool
再看下数据的分布跟走势,这个时候曲线已经非常的连续
# Draw the weighted price against the row index to inspect the overall trend.
_, price_ax = plt.subplots()
price_ax.plot(data['Weighted_Price'], label='Price')
price_ax.set_ylabel('Price')
price_ax.legend()
plt.show()
![png](output_24_0.png)
将数据归一化到0-1
"""
data:Pandas.DataFrame --> data_set:numpy.nArray
#1.刪除掉时间索引列
#2.把所有取值转换成浮点数
#3.统一的进行归一化处理
"""
# Build the model input matrix: drop the Timestamp column, cast the remaining
# columns to float32 and rescale every column to the [0, 1] range.
# NOTE(review): the scaler is fitted on the whole data set, i.e. before the
# train/test split that follows, so test rows influence the scaling - confirm
# this is acceptable for the evaluation.
feature_frame = data.drop(columns=['Timestamp'])
mms = MinMaxScaler(feature_range=(0, 1))
data_set = mms.fit_transform(feature_frame.to_numpy(dtype='float32'))
以2:8划分测试数据集跟训练数据集
"""
划分训练集和测试集
"""
# Split by row order: the leading 80% of the rows form the training set and
# the trailing 20% form the test set (no shuffling).
ratio = 0.8
train_size = int(ratio * len(data_set))
test_size = len(data_set) - train_size
train = data_set[:train_size]
test = data_set[train_size:]
创建训练数据集跟测试数据集,以1个时间步(window = 1,即前一条记录)作为窗口期来创建我们的训练数据集跟测试数据集。
def create_dataset(data, window=1, label_index=6):
    """Turn a 2-D feature matrix into supervised (samples, targets) arrays.

    Sample ``i`` consists of the ``window`` consecutive rows starting at row
    ``i``; its target is column ``label_index`` of the row immediately after
    that window.

    Args:
        data: 2-D array-like of shape ``(n_rows, n_features)``.
        window: Number of consecutive rows per sample. Defaults to 1.
        label_index: Column used as the prediction target. Defaults to 6,
            which is Weighted_Price once the Timestamp column is dropped.

    Returns:
        A tuple ``(x, y)``: ``x`` has shape
        ``(n_rows - window, window, n_features)`` and ``y`` has shape
        ``(n_rows - window,)``. If ``data`` has ``window`` rows or fewer,
        both arrays are empty but keep those ranks, so ``x.shape[1]`` and
        ``x.shape[2]`` stay valid for building the model input shape.

    Raises:
        ValueError: If ``window`` is smaller than 1 or ``data`` is not 2-D.
    """
    data = np.asarray(data)
    if window < 1:
        raise ValueError("window must be at least 1")
    if data.ndim != 2:
        raise ValueError("data must be a 2-D array")

    n_samples = max(len(data) - window, 0)
    if n_samples == 0:
        # Nothing to window: return correctly ranked empties instead of the
        # rank-1 arrays that np.array([]) would produce.
        return (
            np.empty((0, window, data.shape[1]), dtype=data.dtype),
            np.empty((0,), dtype=data.dtype),
        )

    x = np.array([data[start:start + window] for start in range(n_samples)])
    # The target of sample i is row i + window, i.e. rows window..n_rows-1.
    y = data[window:, label_index].copy()
    return x, y
train_x, train_y = create_dataset(train)
test_x, test_y = create_dataset(test)
loss为平均绝对误差(Mean Absolute Error,MAE)
def create_model():
    """Build and compile the single-layer LSTM regressor.

    The input shape (time steps, features) is read from the module-level
    ``train_x`` array, so ``train_x`` must exist before this is called.

    Returns:
        The compiled Keras ``Sequential`` model: a 50-unit LSTM followed by a
        one-unit Dense output, compiled with MAE loss and the Adam optimizer.
        The layer summary is printed as a side effect.
    """
    time_steps, n_features = train_x.shape[1:3]
    network = Sequential([
        LSTM(50, input_shape=(time_steps, n_features)),
        Dense(1),
    ])
    network.compile(optimizer='adam', loss='mae')
    network.summary()
    return network
model = create_model()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 50) 11600
dense (Dense) (None, 1) 51
=================================================================
Total params: 11,651
Trainable params: 11,651
Non-trainable params: 0
_________________________________________________________________
这里节约时间,只训练3代,推荐迭代20~30次,效果会更好
history = model.fit(train_x, train_y, epochs=3, batch_size=64, validation_data=(test_x, test_y), verbose=1, shuffle=False)
Epoch 1/3
41632/41632 [==============================] - 109s 3ms/step - loss: 0.0028 - val_loss: 0.0523
Epoch 2/3
41632/41632 [==============================] - 111s 3ms/step - loss: 0.0019 - val_loss: 0.0358
Epoch 3/3
41632/41632 [==============================] - 111s 3ms/step - loss: 0.0012 - val_loss: 0.0264
# Compare the per-epoch training loss with the validation loss.
_, loss_ax = plt.subplots()
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    loss_ax.plot(history.history[history_key], label=curve_label)
loss_ax.legend()
plt.show()
![png](output_38_0.png)
在测试集上预测,并绘制真实值和预测值的对比图
# Predict on the test windows and overlay the predictions on the true targets;
# both series are on the scaled axis produced by the MinMaxScaler.
predict = model.predict(test_x)
_, compare_ax = plt.subplots()
compare_ax.plot(predict, label='predict')
compare_ax.plot(test_y, label='ground true')
compare_ax.legend()
plt.show()
![png](output_40_0.png)
model.save("./saved_model/",save_format="tf")
WARNING:absl:Found untraced functions such as lstm_cell_layer_call_fn, lstm_cell_layer_call_and_return_conditional_losses, lstm_cell_layer_call_fn, lstm_cell_layer_call_and_return_conditional_losses, lstm_cell_layer_call_and_return_conditional_losses while saving (showing 5 of 5). These functions will not be directly callable after loading.
INFO:tensorflow:Assets written to: ./saved_model/assets
INFO:tensorflow:Assets written to: ./saved_model/assets
WARNING:absl:<keras.layers.recurrent.LSTMCell object at 0x7f0e246d9430> has the same name 'LSTMCell' as a built-in Keras object. Consider renaming <class 'keras.layers.recurrent.LSTMCell'> to avoid naming conflicts when loading with `tf.keras.models.load_model`. If renaming is not possible, pass the object in the `custom_objects` parameter of the load function.
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, LongType
from pyspark.sql.functions import pandas_udf,udf,PandasUDFType
import pyspark.sql.functions as F
# Start (or reuse) a local Spark session with 4 worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("PredictBTC")
    .getOrCreate()
)
# Ship the last 200 rows (without the Timestamp column) to Spark as test data.
recent_rows = data.drop(columns=['Timestamp']).tail(200)
df_test = spark.createDataFrame(recent_rows)
df_test.show(10)
+--------+--------+--------+--------+------------+-----------------+--------------+
| Open| High| Low| Close|Volume_(BTC)|Volume_(Currency)|Weighted_Price|
+--------+--------+--------+--------+------------+-----------------+--------------+
|10308.35|10308.38|10305.46|10305.48| 3.4655844| 35724.30454| 10308.306022|
|10308.29|10309.39|10300.02|10300.02| 0.60133309| 6199.0171228| 10308.790961|
|10303.66|10304.79|10303.66|10304.79| 0.07403171| 762.87531394| 10304.710157|
|10309.07|10309.07|10304.92|10304.92| 1.44460715| 14892.516672| 10309.042615|
|10304.97|10306.67|10304.97|10306.67| 0.07512171| 774.19907408| 10305.929858|
|10310.46|10310.46|10302.89|10306.96| 0.03689133| 380.30328001| 10308.744087|
|10309.15|10310.04|10303.36| 10309.5| 0.81774082| 8430.3061598| 10309.264199|
|10302.45|10302.45|10302.45|10302.45| 0.00554888| 57.167058756| 10302.45|
|10309.97|10312.56|10308.39|10308.94| 1.3198321| 13609.602688| 10311.616673|
|10312.09|10312.09|10312.09|10312.09| 0.01105789| 114.02995689| 10312.09|
+--------+--------+--------+--------+------------+-----------------+--------------+
only showing top 10 rows
@pandas_udf(returnType='predict float,label float',functionType=PandasUDFType.GROUPED_MAP)
def predict_udf(df):
    """Score one group of rows with the saved LSTM model.

    Declared as a grouped-map pandas UDF: it receives one group's rows as a
    pandas DataFrame and returns a pandas DataFrame for that group.

    Args:
        df: pandas DataFrame with one group's rows. Column 6 is used as the
            label; in ``df_test`` that is Weighted_Price - confirm the column
            order if this is applied to a different DataFrame.

    Returns:
        A pandas DataFrame with the columns ``predict`` (model output) and
        ``label`` (scaled value of column 6). Both are on the 0-1 scale of
        the scaler fitted inside this function. The input ``df`` is left
        unmodified.
    """
    # 1. Load the LSTM model from the directory it was saved to.
    lstm_model = tf.keras.models.load_model("./saved_model/")

    # 2. Scale the group's values to [0, 1].
    # NOTE(review): a fresh scaler is fitted on every group, so the scaling
    # differs from the one the model was trained with - consider persisting
    # and reusing the training scaler.
    features = df.values.astype('float32')
    scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(features)

    # 3. One sample per row with a single time step: (rows, 1, features).
    n_rows, n_features = scaled.shape
    samples = scaled.reshape(n_rows, 1, n_features)

    # 4. The Dense(1) head yields an (rows, 1) array; flatten it so the column
    #    assignment does not rely on pandas coercing a 2-D array.
    predictions = lstm_model.predict(samples).ravel()

    # assign() builds a new frame, leaving the caller's DataFrame untouched.
    scored = df.assign(label=scaled[:, 6], predict=predictions)
    return scored.loc[:, ["predict", "label"]]
result_df = df_test.groupBy(F.spark_partition_id()).apply(predict_udf)
result_df.show(10)
+-----------+-----------+
| predict| label|
+-----------+-----------+
|0.088662505| 0.08944702|
| 0.13751054|0.103881836|
| 0.05792491| 0.11117554|
| 0.09696321| 0.0|
| 0.37479195| 0.41714478|
| 0.5565088| 0.66851807|
| 0.57387674| 0.7853699|
| 0.58684516| 0.7824402|
| 0.5714736| 0.76208496|
| 0.6371473| 0.73916626|
+-----------+-----------+
only showing top 10 rows
Linux看这里:http://blog.csdn.net/qq_15773669/article/details/69062374 如何打开ipynb文件 通过安装anaconda anaconda包含大量的科学计算包,其中就包含ipython 和jupyter,安装之后即可用其打开ipython文件.安装方式为: 官网下载相应python版本的安装文件,我这里下载的是python2...
目录 1、安装ipython 和jupyter 2、使用 1、安装ipython 和jupyter 命令如下: 2、使用 接下来在需要查看的ipynb文件目录上,终端中输入: 即可在浏览器中打开当前目录。 ...
在ipynb文件最后一行加入此代码,同时保证文件名和ipynb文件的文件名一致。...
第一步,安装Python2.7和pip sudo apt-get update 安装python sudo apt-get -y install python2.7 python-pip python-dev 确认python版本 python --version,输入后该命令后,我的终端输出Python 2.7.12 确认pip安装,输入:pip --version, 我的终端显示:pip 8....
注意:导出接口是Get请求方式。 例:...
工具类: 实现类: 简简单单 ...
2019独角兽企业重金招聘Python工程师标准>>> 1. 安装VSCode插件 打开VSCode,按F1 输入ext install,选择安装扩展 搜素markdown pdf,安装插件 2. 编辑.md文件 后缀名为.md 3. 导出pdf 文件中点击鼠标右键,选择Markdown PDF:Export(pdf) 在文件的原目录下就会产生编译后的pdf文件 转载于:http...
.ipynb文件转.py文件 然后输入命令: jupyter nbconvert --to script demo.ipynb 即可...
一、转换为html文件,在相应文件夹里,进入对应的conda环境,然后输入命令 里面notebook.ipynb是你需要转换的文件名字,你相应的更改文件名字,例如我的是test.ipynb,那就是 二、转换为markdown文件 其实和上面一样就是一行代码的事 三、转换为pdf文件 这个如果你直接输入和上面差不多的代码,很可能报错,例如 该错误提示没有安装xelatex。所以,我们需要提前安装xe...
题目: 数组中有一个数字出现的次数超过数组长度的一半,请找出这个数字。例如输入一个长度为9的数组{1,2,3,2,2,2,5,4,2}。由于数字2在数组中出现了5次,超过数组长度的一半,因此输出2。如果不存在则输出0。 思路: ①作一个判断条件。 ②作俩个for循环,用每一个值依次和后面的值作比较。 ③当这俩个值相等时,计数加一。 ④当计数值大于数组长度的一半时,输出这个值就好了。 这里要注意in...