pyspark之LogisticRegression算法

本文深入探讨了如何在PySpark中使用LogisticRegression算法进行数据分析和预测。通过实例,详细解释了模型构建、参数调整及结果评估的过程,为理解和应用Logistic Regression提供了实用指南。
import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf,SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.feature import StandardScaler

def extract_features(fleld,catedoriesMap,featureEnd):
    """Build the feature vector for one parsed row.

    The result is a one-hot encoding of the category found in column 3
    (looked up in ``catedoriesMap``) followed by the numeric values of
    columns 4 up to, but not including, ``featureEnd``.
    """
    hot_index = catedoriesMap[fleld[3]]
    one_hot = np.zeros(len(catedoriesMap))
    one_hot[hot_index] = 1
    # Missing values ("?") are turned into 0 by convert_float.
    numeric_part = [convert_float(raw) for raw in fleld[4:featureEnd]]
    return np.concatenate((one_hot, numeric_part))

def extract_label(field):
    """Return the label of a parsed row: its last column, as a float."""
    return float(field[-1])
def convert_float(x):
    """Convert a raw field to a number; the placeholder "?" becomes 0."""
    if x == "?":
        return 0
    return float(x)

# Reuse the SparkContext supplied by the pyspark shell / notebook when there
# is one; otherwise create it, so the script also runs where no `sc` has been
# predefined (e.g. spark-submit).
sc = SparkContext.getOrCreate()

# Base location of the input data: the local filesystem when Spark runs in
# local mode, HDFS otherwise.
if sc.master.startswith('local'):
    Path='file:/home/swt/pythonwork/PythonProject/'
else:
    Path="hdfs://localhost:9000/user/swt/"

# Load the raw TSV file, drop its header row, strip the double quotes and
# split every remaining line into its tab-separated fields.
# (These statements were once meant to be the body of prepare_data(sc).)
print('load data...')
rawDataWithHeader = sc.textFile(Path+'data/train.tsv')
header = rawDataWithHeader.first()
rawData = rawDataWithHeader.filter(lambda row: row != header)
rData = rawData.map(lambda row: row.replace('"', ''))
lines = rData.map(lambda row: row.split("\t"))
# Number of data rows (header excluded).
print("is "+str(lines.count()))

# Assign an integer index to every distinct value found in column 3 (used
# for the one-hot encoding), then extract the label of each row.
categoriesMap = (
    lines.map(lambda fields: fields[3])
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)
labelRDD = lines.map(extract_label)
print(labelRDD.take(3))

# Sample output: [0.0, 1.0, 1.0]
# Extract the feature vector of every row; the last column is the label, so
# it is excluded from the features.
featureRDD = lines.map(
    lambda row: extract_features(row, categoriesMap, len(row) - 1)
)

# Standardize the features using the column statistics of the whole data set.
scaler = StandardScaler(withMean=True, withStd=True)
stdScaler = scaler.fit(featureRDD)
ScalerFeatureRDD = stdScaler.transform(featureRDD)

# Pair every label with its standardized feature vector.
labelpoint = labelRDD.zip(ScalerFeatureRDD)
print('label', labelpoint.take(2))
# Sample output; it shows that the features are still DenseVector objects:
# label [(0.0, DenseVector([2.7207, -0.2327, -0.6808, -0.3818, -0.1019, -0.2205, -0.2042, -0.0649, -0.0991, -0.0233, -0.0285, -0.4464, -0.271, -0.2017, 1.1376, -0.0819, 1.0251, -0.0559, -0.4689, -0.3543, -0.3175, 0.3385, 0.0, 0.8288, -0.1473, 0.2296, -0.1416, 0.7902, 0.7172, -0.298, -0.2035, -0.033, -0.0488, 0.9401, -0.1087, -0.2788])), (1.0, DenseVector([-0.3675, -0.2327, -0.6808, -0.3818, -0.1019, -0.2205, -0.2042, -0.0649, -0.0991, -0.0233, -0.0285, 2.2397, -0.271, -0.2017, 0.4887, 0.1063, 0.1959, 0.509, 1.2695, 1.3097, -0.3132, 0.3385, 0.0, 1.0202, -0.1473, -0.5771, -0.0975, 0.7902, 0.7172, 0.4866, -0.2035, -0.0838, 0.0459, 1.2494, 0.0489, 0.3058]))]
# Wrap every (label, features) pair in a LabeledPoint, the input type
# expected by the MLlib training API.
labelpointRDD = labelpoint.map(lambda pair: LabeledPoint(pair[0], pair[1]))
print('labelRDD', labelpointRDD.take(3))
# Sample output; the records are now LabeledPoint objects:
# labelRDD [LabeledPoint(0.0, [2.7207366564548514,-0.23272797709480803,-0.6807527904251456,-0.38181322324318134,-0.10189469097220732,-0.22052688457880879,-0.20418221057887365,-0.06487757239262681,-0.09914991930875496,-0.02326210589837061,-0.028494000387023734,-0.4464212047941535,-0.2709990696925828,-0.2016540523193296,1.137647336497678,-0.08193557169294771,1.0251398128933331,-0.05586356442541689,-0.4688932531289357,-0.3543053263079386,-0.3175352172363148,0.3384507982396541,0.0,0.828822173315322,-0.14726894334628504,0.22963982357813484,-0.14162596909880876,0.7902380499177364,0.7171947294529865,-0.29799681649642257,-0.2034625779299476,-0.03296720969690391,-0.04878112975579913,0.9400699751165439,-0.10869848852526258,-0.2788207823137022]), LabeledPoint(1.0, [-0.3674978139186906,-0.23272797709480803,-0.6807527904251456,-0.38181322324318134,-0.10189469097220732,-0.22052688457880879,-0.20418221057887365,-0.06487757239262681,-0.09914991930875496,-0.02326210589837061,-0.028494000387023734,2.2397340510665176,-0.2709990696925828,-0.2016540523193296,0.4886859904169113,0.10628363705145247,0.19588566290866805,0.5089868068250981,1.2694691632834691,1.3097138984590067,-0.31317609057749013,0.3384507982396541,0.0,1.020243830531209,-0.14726894334628504,-0.5770724205625781,-0.09745981080144801,0.7902380499177364,0.7171947294529865,0.4865822517691842,-0.2034625779299476,-0.08378163520013758,0.04594422902162049,1.2493695598285408,0.04885342046314602,0.3057802219012584]), LabeledPoint(1.0, 
# [-0.3674978139186906,-0.23272797709480803,-0.6807527904251456,-0.38181322324318134,-0.10189469097220732,-0.22052688457880879,-0.20418221057887365,-0.06487757239262681,-0.09914991930875496,-0.02326210589837061,-0.028494000387023734,-0.4464212047941535,3.6895505753205593,-0.2016540523193296,1.7637001514533053,-0.04396165033238008,0.46169187416320356,0.7334297297958666,0.29269849146593974,-0.09123800981506591,-0.3032188826367953,0.3384507982396541,0.0,0.3866538001107552,-0.14726894334628504,-0.14053895060796384,-0.08084807670648379,0.7902380499177364,0.7171947294529865,1.2221251282681904,-0.2034625779299476,-0.39171029351574205,0.4415619039155495,1.8679687292525349,-0.03381270192906146,-0.5503866803183464])]
# Randomly split the data into training, validation and test sets using the
# weights 8 : 1 : 1.
trainData, validationData, testData = labelpointRDD.randomSplit([8, 1, 1])

# Persist the three subsets so that they are not recomputed on every use.
for subset in (trainData, validationData, testData):
    subset.persist()

# Train the model.
start_time = time.time()  # timestamp taken just before training starts
model = LogisticRegressionWithSGD.train(
    trainData, iterations=15, step=10, miniBatchFraction=0.5
)

# By default predict() applies the model's decision threshold and returns
# hard 0/1 class labels. The area under the ROC curve has to be computed
# from the raw scores, so the threshold is cleared while scoring the
# validation set.
saved_threshold = model.threshold
model.clearThreshold()

validation_features = validationData.map(lambda p: p.features)
validation_labels = validationData.map(lambda p: p.label)
score = model.predict(validation_features).map(float)
scoreAndLabels = score.zip(validation_labels)

# Compute the AUC on the validation set.
# (BinaryClassificationMetrics is already imported at the top of the file.)
metrics = BinaryClassificationMetrics(scoreAndLabels)
AUC = metrics.areaUnderROC
print('auc',AUC)

# Restore the threshold only after the metric has been evaluated (RDDs are
# lazy), so that later predict() calls return class labels again.
model.setThreshold(saved_threshold)
# Sample output: auc 0.6603715728715729
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值