第5回の練習問題の解答例¶

ここまで扱ってきた英語学習者のデータの一部には、評価値が付与されています。評価値は"../DATA02/nice_evaluation.csv"に保存されています。評価された作文は"../DATA02/NICE_NNS2"に保存されています。品詞の頻度、依存関係の頻度、語彙の多様性、語彙の洗練度に関する特徴量のひとつ(あるいは複数)を用いて自動採点システムを構築しなさい。

In [1]:
import os
import collections
import numpy as np
import pandas as pd
from nltk import sent_tokenize,word_tokenize
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Read every essay in the corpus directory into T, one string per file.
fnames = os.listdir("../DATA02/NICE_NNS2/")

## Sort the file names ##
# os.listdir returns names in arbitrary order; sorting makes the order of T
# (and of everything derived from it) deterministic.
# NOTE(review): presumably the sorted order is what lines the essays up with
# the rows of nice_evaluation.csv — verify.
fnames.sort()

T = []

for i in fnames:
    # The context manager closes the handle even if read() raises.
    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm the corpus encoding.
    with open("../DATA02/NICE_NNS2/"+i,"r") as f:
        text = f.read()
    T.append(text)
In [3]:
# Load the evaluation table; the first CSV column becomes the row index.
E = pd.read_csv("../DATA02/nice_evaluation.csv", index_col=0)
# Target labels: the "score" column as a list, in CSV row order.
# NOTE(review): presumably this row order matches the sorted essay file
# names used for the features — verify.
Y = [value for value in E["score"]]
In [4]:
# Token count of each essay, as produced by NLTK's word_tokenize.
WORDS = [len(word_tokenize(essay)) for essay in T]
In [5]:
# Replace every essay by the sequence of spaCy tags (token.tag_) of its tokens.
POS = [[tok.tag_ for tok in nlp(essay)] for essay in T]
In [6]:
# Raw tag frequencies: one Counter per essay.
POS_FREQ = [collections.Counter(tags) for tags in POS]
In [7]:
# Every distinct tag seen anywhere in the corpus, in first-occurrence order.
# dict.fromkeys keeps insertion order, so the result matches a manual
# "append if not seen yet" scan.
tag_types = list(dict.fromkeys(tag for tags in POS for tag in tags))
In [8]:
# Give every essay an explicit entry for every tag: tags the essay never
# used are added with frequency 0, existing counts are left untouched.
for counts in POS_FREQ:
    for tag in tag_types:
        counts.setdefault(tag, 0)
In [9]:
# Convert raw counts to frequencies per 100 words, in place.
# NOTE(review): the denominator is the NLTK token count (WORDS) while the tag
# counts come from spaCy tokens — confirm that mixing the two is intended.
# An essay with zero tokens raises ZeroDivisionError here.
for counts, n_words in zip(POS_FREQ, WORDS):
    scale = 100 / n_words
    # Only existing keys are reassigned, so iterating while updating is safe.
    for tag in counts:
        counts[tag] = counts[tag] * scale
In [10]:
# Reshape the per-essay dictionaries into column form for pandas:
# D maps each tag to the list of its values, one entry per essay.
D = {tag: [] for tag in tag_types}

for counts in POS_FREQ:
    for tag, value in counts.items():
        D[tag].append(value)
In [11]:
# One row per essay (indexed by file name), one column per tag.
df = pd.DataFrame(data=D, index=fnames)
# Show the first five rows as the cell output.
df.head(5)
Out[11]:
DT JJ NNP IN , NN VBZ VBG RB . ... PDT WDT HYPH RBR JJS SYM NNPS UH WP$ NFP
JPN002.txt 7.908163 10.204082 2.040816 11.479592 7.142857 18.877551 3.826531 1.785714 8.163265 6.377551 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0
JPN004.txt 3.819444 7.638889 0.694444 10.069444 4.861111 25.347222 3.125000 0.347222 2.777778 5.902778 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0
JPN006.txt 8.759124 9.671533 2.737226 14.598540 3.832117 11.861314 2.007299 2.372263 3.467153 4.744526 ... 0.364964 0.182482 0.182482 0.364964 0.000000 0.0 0.0 0.0 0.0 0.0
JPN008.txt 3.313253 6.927711 1.204819 14.156627 1.506024 17.168675 3.614458 1.807229 3.915663 6.325301 ... 0.000000 0.602410 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0
JPN010.txt 8.951407 7.928389 1.278772 12.531969 3.324808 16.112532 3.580563 0.767263 5.882353 4.603581 ... 0.255754 0.767263 0.511509 0.000000 0.255754 0.0 0.0 0.0 0.0 0.0

5 rows × 45 columns

KNNを用いた自動採点

In [12]:
# Feature matrix as a NumPy array (df.values), one row per essay.
X = df.values

# A single 80/20 split is shared by every k below; no random_state is given,
# so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Fit a k-nearest-neighbours classifier for k = 1..30 and record the
# score on the held-out part for each k.
N = list(range(1, 31))
S = []
for i in N:
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    S.append(score)

# Plot the recorded score against k.
plt.xlabel("number_of_k")
plt.ylabel("accuracy")
plt.plot(N, S)
Out[12]:
[<matplotlib.lines.Line2D at 0x157707670>]
No description has been provided for this image

ナイーブベイズ分類器を用いた自動採点

In [13]:
# Repeat the Gaussian naive Bayes evaluation on 99 random 80/20 splits and
# record the held-out score of each run.
N, S = [], []

for i in range(1, 100):
    # A fresh seed drawn from [0, 1000) picks the split for this run.
    seed = np.random.randint(1000)
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=seed
    )
    nbc = GaussianNB().fit(X_train, y_train)
    S.append(nbc.score(X_test, y_test))
    N.append(i)

# Plot the score of each run against the run number.
plt.ylabel("accuracy")
plt.plot(N, S)
Out[13]:
[<matplotlib.lines.Line2D at 0x157804940>]
No description has been provided for this image