Naive Bayes / SVM Text Classification

Posted by 青空梔淺 on 2018-10-25
import jieba
import pandas as pd
# Load each news category and drop rows with missing content
df_technology = pd.read_csv("./data/technology_news.csv", encoding="utf-8")
df_technology = df_technology.dropna()

df_car = pd.read_csv("./data/car_news.csv", encoding="utf-8")
df_car = df_car.dropna()

df_entertainment = pd.read_csv("./data/entertainment_news.csv", encoding="utf-8")
df_entertainment = df_entertainment.dropna()

df_military = pd.read_csv("./data/military_news.csv", encoding="utf-8")
df_military = df_military.dropna()

df_sports = pd.read_csv("./data/sports_news.csv", encoding="utf-8")
df_sports = df_sports.dropna()

# Take up to 20,000 content lines per category
technology = df_technology.content.values.tolist()[1000:21000]
car = df_car.content.values.tolist()[1000:21000]
entertainment = df_entertainment.content.values.tolist()[:20000]
military = df_military.content.values.tolist()[:20000]
sports = df_sports.content.values.tolist()[:20000]
# Load the stopword list: one word per line, tab-separated, no quoting
stopwords = pd.read_csv("data/stopwords.txt", index_col=False, quoting=3, sep="\t", names=["stopword"], encoding="utf-8")
stopwords = stopwords["stopword"].values
def preprocess_text(content_lines, sentences, category):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)                            # segment with jieba
            segs = filter(lambda x: len(x) > 1, segs)          # drop single-character tokens
            segs = filter(lambda x: x not in stopwords, segs)  # drop stopwords
            sentences.append((" ".join(segs), category))
        except Exception as e:
            print(line)
            continue
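As a quick sanity check (a sketch only; the exact tokens depend on jieba's dictionary and your stopword list), feeding a single raw line through preprocess_text yields one (space-joined tokens, label) pair:

demo = []
preprocess_text(["這是有史以來最大的一次軍艦演習"], demo, "military")
print(demo)  # e.g. [('有史以來 軍艦 演習', 'military')] -- illustrative output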

# Generate the training data
sentences = []

preprocess_text(technology, sentences, "technology")
preprocess_text(car, sentences, "car")
preprocess_text(entertainment, sentences, "entertainment")
preprocess_text(military, sentences, "military")
preprocess_text(sports, sentences, "sports")
import random
random.shuffle(sentences)  # shuffle so the categories are interleaved
for sentence in sentences[:10]:
    print(sentence[0], sentence[1])
    
from sklearn.model_selection import train_test_split
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(
    analyzer="word",    # tokenise on whitespace-separated words
    max_features=4000,  # keep only the 4000 most frequent terms
)
vec.fit(x_train)

def get_features(x):
    return vec.transform(x)
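With vec fitted, a bare-bones bag-of-words baseline can be trained before wrapping everything in a class (a minimal sketch reusing the vec and get_features defined above):

from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on raw term counts
classifier = MultinomialNB()
classifier.fit(get_features(x_train), y_train)
print(classifier.score(get_features(x_test), y_test))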

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


class TextClassifier():

    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        self.vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 4), max_features=20000)

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        return self.classifier.score(self.features(X), y)
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
print(text_classifier.predict("這 是 有史以來 最 大 的 一 次 軍艦 演習"))
print(text_classifier.score(x_test, y_test))
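Overall accuracy can hide per-class differences; scikit-learn's classification_report adds precision, recall and F1 per category (a small add-on, not part of the original snippet):

from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the Naive Bayes classifier
y_pred = text_classifier.classifier.predict(text_classifier.features(x_test))
print(classification_report(y_test, y_pred))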
from sklearn.svm import SVC
svm = SVC(kernel="linear")
svm.fit(vec.transform(x_train), y_train)
svm.score(vec.transform(x_test), y_test)
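A side note: for a linear kernel on high-dimensional sparse text features, LinearSVC (liblinear-based) usually trains much faster than SVC(kernel="linear") while learning a comparable linear decision boundary:

from sklearn.svm import LinearSVC

# Same kind of linear separator, trained with liblinear,
# which scales better to tens of thousands of sparse features
fast_svm = LinearSVC()
fast_svm.fit(vec.transform(x_train), y_train)
print(fast_svm.score(vec.transform(x_test), y_test))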

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


class TextClassifier():

    def __init__(self, classifier=SVC(kernel="linear")):
        self.classifier = classifier
        self.vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 3), max_features=12000)

    def features(self, X):
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)

    def predict(self, x):
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        return self.classifier.score(self.features(X), y)
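
The TF-IDF/SVM version is driven the same way as the Naive Bayes classifier above (fitting an SVC on this much data can take noticeably longer):

text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
print(text_classifier.predict("這 是 有史以來 最 大 的 一 次 軍艦 演習"))
print(text_classifier.score(x_test, y_test))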
