第4回の練習問題の解答例

ここまで扱ってきた英語学習者のデータの一部には、評価値が付与されています。評価値は "../DATA02/nice_evaluation.csv" に、評価された作文は "../DATA02/NICE_NNS2" に保存されています。ここで学んだのと同様の手順で、このデータを自動採点するシステムを構築し、交差検証を行いなさい。

In [15]:
import os
import pandas as pd
from nltk import sent_tokenize
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# Collect the names of all entries in the essay directory.
fnames = os.listdir("../DATA02/NICE_NNS2/")

## Sort the file names ##
# NOTE(review): the sorted order presumably has to match the row order of the
# evaluation CSV loaded later, since essays and scores are paired by position
# -- confirm against the data.
fnames.sort()

# T holds the raw text of every essay, in sorted file-name order.
T = []

for i in fnames:
    # The context manager closes the file even when read() raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open("../DATA02/NICE_NNS2/"+i,"r") as f:
        text = f.read()
    T.append(text)
In [6]:
# Load the evaluation table; the first CSV column becomes the row index.
E = pd.read_csv(
    "../DATA02/nice_evaluation.csv",
    index_col=0,
)

# Target labels for the classifier: the values of the "score" column as a list.
Y = list(E["score"])
In [11]:
# Dependency labels (compared against spaCy's token.dep_) that are counted as
# one clause each.
CLAUSE = ["advcl","relcl","ccomp","xcomp","acl","csubj"]

# CpS[k] is the number of CLAUSE-labelled tokens per sentence in essay T[k].
CpS = []

for i in T:
    c = 0
    sents = sent_tokenize(i)
    s = len(sents)
    for j in sents:
        # Each sentence is parsed on its own.
        doc = nlp(j)
        for token in doc:
            if token.dep_ in CLAUSE:
                c +=1
    # An empty essay produces no sentences; record 0.0 instead of raising
    # ZeroDivisionError so that CpS keeps exactly one entry per essay.
    CpS.append(c/s if s else 0.0)
    
In [12]:
# NL[k] is the mean length, in whitespace-separated words, of the noun chunks
# spaCy finds in essay T[k].
NL = []

for i in T:
    tmp = []
    sents = sent_tokenize(i)
    for j in sents:
        doc = nlp(j)
        for chunk in doc.noun_chunks:
            # Length is measured by splitting the chunk text on whitespace.
            tmp.append(len(chunk.text.split()))
    # An essay without any noun chunk would divide by zero; record 0.0 so that
    # NL keeps exactly one entry per essay.
    NL.append(sum(tmp)/len(tmp) if tmp else 0.0)
In [14]:
# Feature matrix: one [clauses-per-sentence, mean-noun-chunk-length] row per
# essay, pairing the two feature lists position by position.
X = [[clause_rate, chunk_len] for clause_rate, chunk_len in zip(CpS, NL)]
In [16]:
# N collects the tried values of k, S the matching test-set accuracies.
N = []
S = []

# Hold out 20% of the essays for testing. The fixed seed makes the split, and
# therefore the plotted curve, reproducible across runs.
# NOTE(review): this is a single hold-out split; if k-fold cross-validation is
# what the exercise intends, sklearn.model_selection.cross_val_score would be
# the tool -- confirm the intended evaluation protocol.
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=0)

# A k-NN query cannot ask for more neighbours than there are fitted samples,
# so the sweep stops at the training-set size when that is below 30.
max_k = min(30,len(X_train))

for i in range(1,max_k+1):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    # score() returns the mean accuracy on the held-out essays.
    score = knn.score(X_test,y_test)
    N.append(i)
    S.append(score)

# Plot accuracy as a function of k.
plt.xlabel("number_of_k")
plt.ylabel("accuracy")
plt.plot(N,S)
Out[16]:
[<matplotlib.lines.Line2D at 0x283d0a130>]
No description has been provided for this image