Hỏi đáp
Chia sẻ kiến thức, cùng nhau phát triển
thay đổi phần [Test] nhưng vẫn giữ nguyên phần [Train]
00:02 22-11-2017
647 lượt xem
3 bình luận
04:02 22-11-2017
import os, sys
import nltk
import numpy as np
import matplotlib.pyplot as plt
DJANGO_PATH_TOKENIZER = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'VnTokenizer/scripts')
sys.path.append(DJANGO_PATH_TOKENIZER)
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from bs4 import BeautifulSoup
from vn_tokenizer import runTokenizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
wordnet_lemmatizer = WordNetLemmatizer()


def _load_reviews(path):
    """Parse the file at `path` and return its <review_text> elements."""
    with open(path, encoding='utf8') as infile:
        soup = BeautifulSoup(infile, "html.parser")
    return soup.findAll('review_text')


# One stopword per line; trailing whitespace/newlines are stripped.
# NOTE(review): assumes stopwords.txt is UTF-8 like the review files - confirm.
with open('stopwords.txt', encoding='utf8') as stopword_file:
    stopwords = set(w.rstrip() for w in stopword_file)

# load the reviews
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
positive_reviews = _load_reviews('hotel/positive.review')
negative_reviews = _load_reviews('hotel/negative.review')

# there are more positive reviews than negative reviews
# so let's take a random sample so we have balanced classes
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]
def my_tokenizer(s):
    """Lowercase `s`, split it with runTokenizer and filter the tokens.

    Tokens whose length is 1 or less and tokens found in the module-level
    `stopwords` set are dropped. The NLTK word tokenizer and the WordNet
    lemmatizer are not applied here.

    Returns:
        list of the remaining tokens, in the order runTokenizer produced them.
    """
    candidates = runTokenizer(s.lower())
    return [
        tok for tok in candidates
        if len(tok) > 1 and tok not in stopwords
    ]
# Build the vocabulary: every distinct token gets the next free column index,
# in order of first appearance (positive reviews first, then negative ones).
# The tokenized reviews are kept so they do not have to be tokenized again
# when the feature vectors are built.
word_index_map = {}
positive_tokenized = []
negative_tokenized = []

for review in positive_reviews:
    review_tokens = my_tokenizer(review.text)
    print(review_tokens)
    positive_tokenized.append(review_tokens)
    for token in review_tokens:
        # len(word_index_map) is the next unused index; existing tokens keep theirs
        word_index_map.setdefault(token, len(word_index_map))

for review in negative_reviews:
    review_tokens = my_tokenizer(review.text)
    negative_tokenized.append(review_tokens)
    for token in review_tokens:
        word_index_map.setdefault(token, len(word_index_map))

# total number of distinct tokens seen
current_index = len(word_index_map)
# now let's create our input matrices
def tokens_to_vector(tokens, label, index_map=None):
    """Turn a token list into a normalized word-frequency row plus a label.

    Args:
        tokens: iterable of token strings.
        label: value stored in the last element of the returned vector.
        index_map: optional token -> column-index mapping. Defaults to the
            module-level ``word_index_map``. Passing the vocabulary built
            from the training reviews lets other text be vectorized with
            the same number of columns.

    Returns:
        1-D float array of length ``len(index_map) + 1``: the proportion of
        each known token followed by ``label``. Tokens missing from the
        mapping are skipped instead of raising ``KeyError``. When no token
        is known the proportions are NaN (the 0/0 result the original
        division produced), so such rows can still be found with np.isnan.
    """
    if index_map is None:
        index_map = word_index_map
    x = np.zeros(len(index_map) + 1)  # last element is for the label
    for t in tokens:
        i = index_map.get(t)
        if i is not None:  # ignore out-of-vocabulary tokens
            x[i] += 1
    total = x.sum()  # label slot is still 0 here, so this is the known-token count
    if total > 0:
        x = x / total  # normalize it before setting label
    else:
        x[:-1] = np.nan  # same outcome as 0/0, without the runtime warning
    x[-1] = label
    return x
# Stack every review into one N x (D+1) array: D word-proportion columns plus
# the label in the final column. Features and labels stay in the same row so
# they can be shuffled together later.
N = len(positive_tokenized) + len(negative_tokenized)
data = np.zeros((N, len(word_index_map) + 1))

# positive reviews are labelled 1, negative reviews 0
labelled_reviews = [(toks, 1) for toks in positive_tokenized]
labelled_reviews += [(toks, 0) for toks in negative_tokenized]

for row, (toks, label) in enumerate(labelled_reviews):
    data[row, :] = tokens_to_vector(toks, label)
# shuffle the data and create train/test splits
# try it multiple times!
np.random.shuffle(data)
# drop rows containing NaN: tokens_to_vector divides by the token count,
# which is zero for a review that has no usable tokens
data = data[~np.isnan(data).any(axis=1)]

X = data[:, :-1]  # word-proportion features
Y = data[:, -1]   # labels (last column)

# Hold out the last rows of the shuffled data for testing: 100 rows, or at
# most half of the data when fewer rows are available, so the training set
# is never emptied by the split.
n_test = min(100, len(data) // 2)
split = len(data) - n_test
Xtrain, Ytrain = X[:split], Y[:split]
Xtest, Ytest = X[split:], Y[split:]

# NOTE: any other test set passed to model.predict must be built with the
# same word_index_map, so that it has the same number of columns as Xtrain.
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
y_pred = model.predict(Xtest)

print (f1_score(Ytest, y_pred, average="macro"))
print (precision_score(Ytest, y_pred, average="macro"))
print (recall_score(Ytest, y_pred, average="macro"))
print (accuracy_score(Ytest,y_pred,normalize=True,sample_weight=None ))
Mình có file text gồm 100 dòng để làm dữ liệu cho phần test.
Mình muốn thay thế phần test trong source bằng phần test của mình thì phải làm thế nào?
Mình có nghĩ ra một số cách thay thế Xtest nhưng đều bị lỗi "X has 785 features per sample; expecting 9571". Mình không biết xử lý như thế nào nữa.
Ý bạn là thao tác tay hay code? Tay thì copy paste. Câu hỏi của bạn còn chưa rõ ràng.