pyspark之LogisticRegression算法

最新推荐文章于 2026-06-17 16:07:05 发布

原创最新推荐文章于 2026-06-17 16:07:05 发布 · 813 阅读

0 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#pyspark #LogisticRegression

虚拟机+大数据专栏收录该内容

9 篇文章

订阅专栏

本文深入探讨了如何在PySpark中使用LogisticRegression算法进行数据分析和预测。通过实例，详细解释了模型构建、参数调整及结果评估的过程，为理解和应用Logistic Regression提供了实用指南。

import sys
import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark import SparkConf,SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.feature import StandardScaler

def extract_features(fleld,catedoriesMap,featureEnd):
    """Build the feature vector for one row of split fields.

    The value in column 3 is one-hot encoded through ``catedoriesMap``
    (category value -> index). Columns 4 up to, but not including,
    ``featureEnd`` are converted with ``convert_float``. The result is the
    one-hot part followed by the numeric part, as a single NumPy array.
    """
    one_hot = np.zeros(len(catedoriesMap))
    one_hot[catedoriesMap[fleld[3]]] = 1
    numeric = [convert_float(raw) for raw in fleld[4:featureEnd]]
    return np.concatenate((one_hot, numeric))

def extract_label(field):
    """Return the last element of ``field`` converted to ``float``."""
    return float(field[-1])
def convert_float(x):
    """Convert a raw field to ``float``, mapping the marker ``"?"`` to 0.0.

    Always returns a ``float`` so callers get a uniform numeric type
    regardless of whether the value was the ``"?"`` marker.

    Raises:
        ValueError: if ``x`` is neither ``"?"`` nor parseable by ``float``.
    """
    return 0.0 if x == "?" else float(x)

# Reuse the SparkContext that the pyspark shell / notebook already provides,
# or create one, so the script also runs when `sc` is not predefined.
sc = SparkContext.getOrCreate()

# Base location of the input data: the local filesystem when the master URL
# starts with "local", HDFS otherwise.
if sc.master.startswith('local'):
    Path = 'file:/home/swt/pythonwork/PythonProject/'
else:
    Path = "hdfs://localhost:9000/user/swt/"

# Load the tab-separated training file, drop every line equal to the first
# (header) line, strip double quotes, and split each row into its fields.
print('load data...')
rawDataWithHeader = sc.textFile(Path + 'data/train.tsv')
header = rawDataWithHeader.first()
rawData = rawDataWithHeader.filter(lambda row: row != header)
rData = rawData.map(lambda row: row.replace('"', ''))
lines = rData.map(lambda row: row.split("\t"))
print("is " + str(lines.count()))

# Map every distinct value of column 3 to an index; extract_features uses
# this mapping for the one-hot category encoding.
categoriesMap = (
    lines.map(lambda fields: fields[3])
    .distinct()
    .zipWithIndex()
    .collectAsMap()
)
# Extract the label (last field of each row, as a float).
labelRDD = lines.map(extract_label)
print(labelRDD.take(3))

[0.0, 1.0, 1.0]

# Extract the feature vector of every row; the last field is the label, so
# the numeric features stop at index len(row) - 1.
featureRDD = lines.map(
    lambda row: extract_features(row, categoriesMap, len(row) - 1)
)

# Standardize the features (centered with withMean, scaled with withStd).
stdScaler = StandardScaler(withMean=True, withStd=True).fit(featureRDD)
ScalerFeatureRDD = stdScaler.transform(featureRDD)

# Pair each label with its standardized feature vector.
labelpoint = labelRDD.zip(ScalerFeatureRDD)
print('label', labelpoint.take(2))

# 从这看出它还是DenseVector格式
label [(0.0, DenseVector([2.7207, -0.2327, -0.6808, -0.3818, -0.1019, -0.2205, -0.2042, -0.0649, -0.0991, -0.0233, -0.0285, -0.4464, -0.271, -0.2017, 1.1376, -0.0819, 1.0251, -0.0559, -0.4689, -0.3543, -0.3175, 0.3385, 0.0, 0.8288, -0.1473, 0.2296, -0.1416, 0.7902, 0.7172, -0.298, -0.2035, -0.033, -0.0488, 0.9401, -0.1087, -0.2788])), (1.0, DenseVector([-0.3675, -0.2327, -0.6808, -0.3818, -0.1019, -0.2205, -0.2042, -0.0649, -0.0991, -0.0233, -0.0285, 2.2397, -0.271, -0.2017, 0.4887, 0.1063, 0.1959, 0.509, 1.2695, 1.3097, -0.3132, 0.3385, 0.0, 1.0202, -0.1473, -0.5771, -0.0975, 0.7902, 0.7172, 0.4866, -0.2035, -0.0838, 0.0459, 1.2494, 0.0489, 0.3058]))]

# Convert each (label, features) pair into a LabeledPoint.
labelpointRDD = labelpoint.map(lambda pair: LabeledPoint(pair[0], pair[1]))
print('labelRDD', labelpointRDD.take(3))

# 转换成了LabeledPoint格式
labelRDD [LabeledPoint(0.0, [2.7207366564548514,-0.23272797709480803,-0.6807527904251456,-0.38181322324318134,-0.10189469097220732,-0.22052688457880879,-0.20418221057887365,-0.06487757239262681,-0.09914991930875496,-0.02326210589837061,-0.028494000387023734,-0.4464212047941535,-0.2709990696925828,-0.2016540523193296,1.137647336497678,-0.08193557169294771,1.0251398128933331,-0.05586356442541689,-0.4688932531289357,-0.3543053263079386,-0.3175352172363148,0.3384507982396541,0.0,0.828822173315322,-0.14726894334628504,0.22963982357813484,-0.14162596909880876,0.7902380499177364,0.7171947294529865,-0.29799681649642257,-0.2034625779299476,-0.03296720969690391,-0.04878112975579913,0.9400699751165439,-0.10869848852526258,-0.2788207823137022]), LabeledPoint(1.0, [-0.3674978139186906,-0.23272797709480803,-0.6807527904251456,-0.38181322324318134,-0.10189469097220732,-0.22052688457880879,-0.20418221057887365,-0.06487757239262681,-0.09914991930875496,-0.02326210589837061,-0.028494000387023734,2.2397340510665176,-0.2709990696925828,-0.2016540523193296,0.4886859904169113,0.10628363705145247,0.19588566290866805,0.5089868068250981,1.2694691632834691,1.3097138984590067,-0.31317609057749013,0.3384507982396541,0.0,1.020243830531209,-0.14726894334628504,-0.5770724205625781,-0.09745981080144801,0.7902380499177364,0.7171947294529865,0.4865822517691842,-0.2034625779299476,-0.08378163520013758,0.04594422902162049,1.2493695598285408,0.04885342046314602,0.3057802219012584]), LabeledPoint(1.0, 
[-0.3674978139186906,-0.23272797709480803,-0.6807527904251456,-0.38181322324318134,-0.10189469097220732,-0.22052688457880879,-0.20418221057887365,-0.06487757239262681,-0.09914991930875496,-0.02326210589837061,-0.028494000387023734,-0.4464212047941535,3.6895505753205593,-0.2016540523193296,1.7637001514533053,-0.04396165033238008,0.46169187416320356,0.7334297297958666,0.29269849146593974,-0.09123800981506591,-0.3032188826367953,0.3384507982396541,0.0,0.3866538001107552,-0.14726894334628504,-0.14053895060796384,-0.08084807670648379,0.7902380499177364,0.7171947294529865,1.2221251282681904,-0.2034625779299476,-0.39171029351574205,0.4415619039155495,1.8679687292525349,-0.03381270192906146,-0.5503866803183464])]

# Split the data into training, validation and test sets with weights 8:1:1.
trainData, validationData, testData = labelpointRDD.randomSplit([8, 1, 1])

# Persist all three splits so later actions can reuse them.
trainData.persist()
validationData.persist()
testData.persist()

# Train the model: 15 iterations, step size 10, mini-batch fraction 0.5.
start_time = time.time()
model = LogisticRegressionWithSGD.train(
    trainData, iterations=15, step=10, miniBatchFraction=0.5
)

# AUC must be computed from continuous scores. With a threshold set,
# predict() returns hard 0/1 classes, which collapses the ROC curve to a
# single operating point. Clear the threshold while scoring and restore it
# afterwards so later uses of `model` keep their class-prediction behavior.
_saved_threshold = model.threshold
model.clearThreshold()

score = model.predict(validationData.map(lambda p: p.features))
score = score.map(float)

# Pair every score with the true label of the same validation row.
scoreAndLabels = score.zip(validationData.map(lambda p: p.label))

# Compute the area under the ROC curve on the validation set.
metrics = BinaryClassificationMetrics(scoreAndLabels)
AUC = metrics.areaUnderROC

# Restore the threshold only after the metric has been computed.
if _saved_threshold is not None:
    model.setThreshold(_saved_threshold)

print('auc', AUC)

auc 0.6603715728715729