Hỏi đáp

Chia sẻ kiến thức, cùng nhau phát triển

thay đổi phần [Test] nhưng vẫn giữ nguyên phần [Train]

00:02 22-11-2017 647 lượt xem 3 bình luận 04:02 22-11-2017
import os, sys
import nltk
import numpy as np
import matplotlib.pyplot as plt

DJANGO_PATH_TOKENIZER = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'VnTokenizer/scripts')
sys.path.append(DJANGO_PATH_TOKENIZER)


from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from bs4 import BeautifulSoup
from vn_tokenizer import runTokenizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

wordnet_lemmatizer = WordNetLemmatizer()


# Stop-word list: one entry per line, trailing whitespace/newline stripped.
# The file is read as UTF-8 (the same encoding used for the review files
# below) and inside a ``with`` block so the handle is closed deterministically.
with open('stopwords.txt', encoding='utf8') as stopword_file:
    stopwords = {line.rstrip() for line in stopword_file}

# load the reviews
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
def _read_review_nodes(path):
    """Parse the UTF-8 file at *path* and return its <review_text> elements."""
    with open(path, encoding='utf8') as infile:
        soup = BeautifulSoup(infile, "html.parser")
    return soup.findAll('review_text')


positive_reviews = _read_review_nodes('hotel/positive.review')
negative_reviews = _read_review_nodes('hotel/negative.review')

# Balance the classes: shuffle the positive reviews in place, then keep at
# most as many of them as there are negative reviews (the original author
# notes that the positive set is the larger one).
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

def my_tokenizer(s):
    """Return the filtered token list for the text *s*.

    The text is lower-cased and split with ``runTokenizer``; tokens whose
    length is 1 or less, and tokens present in the module-level ``stopwords``
    set, are discarded. The order of the remaining tokens is preserved.
    """
    candidates = runTokenizer(s.lower())
    return [tok for tok in candidates if len(tok) > 1 and tok not in stopwords]

# Build the vocabulary: word_index_map assigns every distinct token the next
# free column index, in order of first appearance (positive reviews first,
# then negative ones). The tokenized reviews are kept so that each review is
# tokenized only once.
word_index_map = {}
positive_tokenized = []
negative_tokenized = []

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    print(tokens)
    positive_tokenized.append(tokens)
    for token in tokens:
        # len(word_index_map) is always the next unused index.
        word_index_map.setdefault(token, len(word_index_map))

for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        word_index_map.setdefault(token, len(word_index_map))

# Next free index, i.e. the vocabulary size.
current_index = len(word_index_map)


# now let's create our input matrices
def tokens_to_vector(tokens, label, index_map=None):
    """Turn a token list into a normalized word-frequency vector plus label.

    Args:
        tokens: iterable of token strings.
        label: numeric class label, stored in the last element of the result.
        index_map: token -> column index mapping. Defaults to the module-level
            ``word_index_map``. Pass the map built from the training data when
            vectorizing other data so that the number of columns stays the same.

    Returns:
        A 1-D array of length ``len(index_map) + 1``. The first
        ``len(index_map)`` entries are token counts divided by the total
        count; the last entry is ``label``. Tokens that are not in
        ``index_map`` are ignored instead of raising ``KeyError``. If no token
        is counted at all, the feature entries are NaN (the same marker the
        original 0/0 division produced), so callers can filter such rows with
        ``np.isnan``.
    """
    if index_map is None:
        index_map = word_index_map

    x = np.zeros(len(index_map) + 1)  # last element is for the label
    for t in tokens:
        i = index_map.get(t)
        if i is not None:  # skip out-of-vocabulary tokens
            x[i] += 1

    total = x.sum()  # label slot is still 0 here, so this is the token count
    if total > 0:
        x = x / total
    else:
        # Nothing to normalize: mark the row as invalid explicitly rather
        # than relying on a 0/0 division (which also emits a RuntimeWarning).
        x[:] = np.nan
    x[-1] = label
    return x

N = len(positive_tokenized) + len(negative_tokenized)
# N x (D + 1) matrix: one row per review, D word-frequency columns followed by
# the label column. Features and label stay together so that shuffling the
# rows keeps them aligned.
data = np.zeros((N, len(word_index_map) + 1))
i = 0
# Positive reviews (label 1) fill the first rows, negative ones (label 0) follow.
for label, tokenized_reviews in ((1, positive_tokenized), (0, negative_tokenized)):
    for tokens in tokenized_reviews:
        data[i, :] = tokens_to_vector(tokens, label)
        i += 1

# Shuffle the rows so the split below is random, then drop every row that
# contains NaN (tokens_to_vector produces NaN features for a review that has
# no countable tokens).
np.random.shuffle(data)
data = data[~np.isnan(data).any(axis=1)]

# Features are all columns but the last; the label is the last column.
X = data[:, :-1]
Y = data[:, -1]

# Hold out the final rows as the test set. Previously Xtrain/Ytrain/Xtest/Ytest
# were only present as commented-out lines, so model.fit raised NameError.
# NOTE(review): the hold-out size of 100 is a choice made here, capped at half
# of the data for small corpora - adjust as needed.
# Any external test data must be vectorized with the same word_index_map as
# the training data, otherwise its column count will not match Xtrain.
n_test = min(100, len(data) // 2)
Xtrain, Ytrain = X[:-n_test], Y[:-n_test]
Xtest, Ytest = X[-n_test:], Y[-n_test:]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
y_pred = model.predict(Xtest)

# Macro-averaged F1, precision and recall, followed by plain accuracy.
print(f1_score(Ytest, y_pred, average="macro"))
print(precision_score(Ytest, y_pred, average="macro"))
print(recall_score(Ytest, y_pred, average="macro"))
print(accuracy_score(Ytest, y_pred, normalize=True, sample_weight=None))

Mình có một file text gồm 100 dòng để làm dữ liệu cho phần test.

Mình muốn thay thế phần test trong source bằng phần test của mình thì phải làm thế nào?

Bình luận

Để bình luận, bạn cần đăng nhập bằng tài khoản Howkteam.

Đăng nhập
lehongtuan261295 đã bình luận 04:02 22-11-2017

Mình đã nghĩ ra một số cách thay thế Xtest nhưng đều bị lỗi "X has 785 features per sample; expecting 9571". Mình không biết xử lý như thế nào nữa.

K9 SuperAdmin, KquizAdmin, KquizAuthor đã bình luận 00:04 22-11-2017

Ý bạn là thao tác bằng tay hay bằng code? Nếu bằng tay thì chỉ cần copy và paste. Câu hỏi của bạn còn chưa rõ ràng.

Câu hỏi mới nhất