缺失值处理
import pandas as pda
import numpy as npy

try:
    import matplotlib.pylab as pyl
except ImportError:
    # Nothing in this script plots, so a missing matplotlib install should
    # not prevent the data cleaning from running.
    pyl = None

# The original carried a commented-out loader,
# pda.read_excel("D:/taobao2.xls"); the rows are now supplied by the caller.


def index(data):
    """Treat zero prices as missing, then fill all missing cells with the mean price.

    Args:
        data: Sequence of rows. The first row holds the column names and
            must include "价格" (price); the remaining rows are the records.

    Returns:
        None. Three things are printed, in order: the table as loaded, the
        table after zero prices were turned into missing values, and the
        number of cells that were missing and got filled.

    Raises:
        IndexError: If ``data`` is empty (there is no header row).
        KeyError: If there is no "价格" column.
    """
    frame = pda.DataFrame(data[1:], columns=data[0])
    print(frame)

    # Series.mask returns a new column with the zero prices replaced by NaN.
    # The original assigned through a chained selection
    # (data["价格"][...] = None), which pandas does not guarantee to write
    # back into the frame.
    frame["价格"] = frame["价格"].mask(frame["价格"] == 0)
    print(frame)

    # Every missing cell, in any column, is filled with the mean price; the
    # mean itself ignores missing prices.
    missing = frame.isnull()
    mean_price = frame["价格"].mean()
    # As in the original, the filled table is neither printed nor returned;
    # only the number of filled cells is reported.
    frame = frame.fillna(mean_price)
    print(int(missing.to_numpy().sum()))


if __name__ == "__main__":
    # nosupervision_read_data is not defined in this snippet; presumably the
    # hosting environment provides it -- TODO confirm.
    data = nosupervision_read_data()
    index(data)
数据离散化处理
# Discretization of continuous data: equal-width binning first, then binning
# on explicitly chosen intervals.
import pandas as pda
import numpy as npy

try:
    import matplotlib.pylab as pyl
except ImportError:
    # Nothing in this script plots, so a missing matplotlib install should
    # not prevent the binning from running.
    pyl = None

# The original carried a commented-out loader,
# pda.read_excel("D:/taobao2.xls"); the rows are now supplied by the caller.


def index(data, price_column=2):
    """Bin the price column into equal-width and fixed-interval categories.

    Args:
        data: Sequence of rows. The first row holds the column names; the
            remaining rows are the records.
        price_column: Position (int) or name of the column holding the
            prices. Defaults to 2, the position the original hard-coded.

    Returns:
        None. Four things are printed, in order: the sorted prices, the
        equal-width categories (5 bins), the interval edges, and the
        categories for those intervals.

    Raises:
        IndexError: If ``data`` is empty (there is no header row).
        ValueError: If a price cannot be converted to a number, or there
            are no prices to bin.
    """
    frame = pda.DataFrame(data[1:], columns=data[0])
    if isinstance(price_column, int):
        raw_prices = frame.iloc[:, price_column]
    else:
        raw_prices = frame[price_column]

    # Convert to a numeric array before binning, and sort a copy rather than
    # sorting a view of the frame's values in place.
    prices = npy.sort(pda.to_numeric(raw_prices).to_numpy())
    print(prices)

    # Equal-width binning: the price range is split into 5 bins of the same
    # width.
    bin_count = 5
    equal_width = pda.cut(
        prices, bin_count, labels=["太便宜", "便宜", "适中", "贵", "太贵"]
    )
    print(equal_width)

    # Binning on chosen intervals: (0, 50], (50, 100], (100, top]. pd.cut
    # requires strictly increasing edges, so the maximum price is only usable
    # as the top edge when it exceeds 100; otherwise the last interval is
    # left open-ended. Intervals are right-closed, so prices of 0 or below
    # fall outside every interval and come out as NaN.
    highest = npy.nanmax(prices).item()
    top_edge = highest if highest > 100 else float("inf")
    edges = [0, 50, 100, top_edge]
    print(edges)
    by_interval = pda.cut(prices, edges, labels=["非常便宜", "适中", "贵"])
    print(by_interval)


if __name__ == "__main__":
    # nosupervision_read_data is not defined in this snippet; presumably the
    # hosting environment provides it -- TODO confirm.
    data = nosupervision_read_data()
    index(data)
数据集成处理
# Data integration: take two consecutive chunks of rows from the table and
# stack them back into a single array.
import pandas as pda
import numpy as npy

# The original carried a commented-out loader,
# pda.read_excel("D:/taobao2.xls"); the rows are now supplied by the caller.


def index(data, chunk_size=10):
    """Split the first rows into two chunks and concatenate them again.

    Args:
        data: Sequence of rows. The first row holds the column names; the
            remaining rows are the records.
        chunk_size: Number of rows in each of the two chunks. Defaults to
            10, which reproduces the original hard-coded slices [0:10] and
            [10:20].

    Returns:
        dict with the single key 'data_数据集成', whose value is the
        concatenated rows as a list of lists (at most ``2 * chunk_size``
        rows). The two chunks, the merged table and the dict are also
        printed.

    Raises:
        IndexError: If ``data`` is empty (there is no header row).
    """
    # Per the original author's note, the result has to be a dict named
    # ``output``.
    output = {}
    frame = pda.DataFrame(data[1:], columns=data[0])
    values = frame.values

    first_chunk = values[:chunk_size]
    second_chunk = values[chunk_size:2 * chunk_size]
    merged = npy.concatenate((first_chunk, second_chunk))
    output['data_数据集成'] = merged.tolist()

    # The pieces are printed as DataFrames built from the raw arrays, so they
    # show positional column labels rather than the original column names.
    for part in (first_chunk, second_chunk, merged):
        print(pda.DataFrame(part))
    print(output)
    return output


if __name__ == "__main__":
    # nosupervision_read_data is not defined in this snippet; presumably the
    # hosting environment provides it -- TODO confirm.
    data = nosupervision_read_data()
    index(data)