# coding=utf-8
import codecs
import csv
import numpy as np
import sys
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
# Input CSV: rows whose first column is a numeric id; column 2 holds the
# status string, columns 4+ hold the numeric features (see build_dataset).
SOURCE = "source.csv"
# Output CSV written by run(): features, true label, predicted label, probability.
PREDICT = "predict.csv"
# NOTE(review): appears unused in this file — presumably intended for the
# hard-coded test_size=0.1 in run()'s StratifiedShuffleSplit; confirm and wire up.
TEST_RATE = 0.1
# Module-level dataset accumulators, filled in place by build_dataset():
# features[i] is a list of floats, labels[i] the matching 0/1 class.
features = []
labels = []
def build_dataset():
    """Load feature rows and binary labels from SOURCE into the
    module-level ``features`` and ``labels`` lists.

    Rows whose first column is not a digit string (e.g. a header) are
    skipped. Column 2 equal to u"在线" maps to class 0, anything else
    to class 1; columns 4 onward are parsed as float features.
    """
    # `with` guarantees the handle is closed even if a row fails to
    # parse (the original left the file open on exception).
    with codecs.open(SOURCE, 'r', 'utf-8') as source_file:
        for row in csv.reader(source_file):
            # Data rows start with a numeric id; skip everything else.
            if not row[0].isdigit():
                continue
            data_class = 0 if row[2] == u"在线" else 1
            features.append([float(value) for value in row[4:]])
            labels.append(data_class)
def run():
    """Grid-search an RBF SVM over C/gamma on the module-level dataset,
    write per-row predictions to PREDICT, and print the best CV score.

    Output row format: feature values, true label, predicted label,
    probability of the predicted class.
    """
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    # Log-spaced hyper-parameter grid for the RBF kernel.
    c_range = np.logspace(-3, 3, 7)
    gamma_range = np.logspace(-3, 3, 7)
    param_grid = dict(gamma=gamma_range, C=c_range)
    svc = svm.SVC(kernel='rbf', probability=True,
                  class_weight='balanced', cache_size=512)
    cv = StratifiedShuffleSplit(n_splits=4, test_size=0.1, random_state=42)
    grid = GridSearchCV(svc, param_grid=param_grid, cv=cv, n_jobs=4)
    grid.fit(features_scaled, labels)
    # NOTE(review): predictions are made on the full training data, so
    # the written probabilities are optimistic; kept to preserve the
    # script's existing output contract.
    predict = grid.predict(features_scaled)
    predict_prob = grid.predict_proba(features_scaled)
    # BUG FIX: the original selected column 0 when the predicted class
    # was 1 (and column 1 when it was 0), i.e. the probability of the
    # *other* class. predict_proba columns follow clf.classes_ order
    # ([0, 1] here), so index by the predicted label directly.
    with codecs.open(PREDICT, 'w', 'utf-8') as predict_file:
        for index, feature in enumerate(features):
            prob = predict_prob[index][predict[index]]
            predict_file.write(",".join([str(i) for i in feature]) + "," +
                               str(labels[index]) + "," +
                               str(predict[index]) + "," +
                               str(prob) + "\n")
    score = grid.best_score_
    print("Score:" + str(score))
if __name__ == "__main__":
    # Removed the `reload(sys); sys.setdefaultencoding('utf-8')` hack:
    # it is a known Python-2 anti-pattern (mutates interpreter-wide
    # state) and is unnecessary here because all file I/O already goes
    # through codecs.open(..., 'utf-8'). It also made the script crash
    # on Python 3, where `reload` is not a builtin.
    build_dataset()
    run()