import os
import collections
import numpy as np
import pandas as pd
from nltk import sent_tokenize,word_tokenize
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
%matplotlib inline

# Directory that holds the essay text files.
data_dir = "../DATA02/NICE_NNS2/"

# Sort the file names so the texts are always read in the same order.
fnames = sorted(os.listdir(data_dir))

# Full contents of every file; T[i] is the text of fnames[i].
T = []

for fname in fnames:
    # The context manager closes the handle even if read() raises.
    with open(os.path.join(data_dir, fname), "r") as f:
        T.append(f.read())

# Load the evaluation table and use its "score" column as the labels.
# NOTE(review): Y is assumed to be in the same order as the sorted file
# names; nothing here checks that -- confirm against the CSV.
E = pd.read_csv("../DATA02/nice_evaluation.csv",index_col=0)
Y = list(E["score"])

# Count the words in each text: WORDS[i] is the number of tokens that
# NLTK's word_tokenize produces for T[i].
WORDS = [len(word_tokenize(text)) for text in T]

# Convert the words of each text to POS tags: POS[i] is the list of
# `tag_` values of the spaCy tokens of T[i], in document order.
POS = [[token.tag_ for token in nlp(text)] for text in T]

# POS tag frequencies: one Counter per text.
POS_FREQ = [collections.Counter(tags) for tags in POS]

# Every POS tag type seen in any text, in order of first appearance.
# dict.fromkeys keeps the first occurrence of each key and preserves order.
tag_types = list(dict.fromkeys(tag for tags in POS for tag in tags))

# Add a zero entry to each text's Counter for every tag it never used, so
# that all Counters end up with the same set of keys.
for freqs in POS_FREQ:
    for tag in tag_types:
        freqs.setdefault(tag, 0)

# Convert the raw counts to frequencies per 100 words (in place).
# NOTE(review): the tag counts come from spaCy tokens while the word totals
# in WORDS come from NLTK's word_tokenize, so the two tokenisations may not
# agree exactly -- confirm this is intended.
for freqs, n_words in zip(POS_FREQ, WORDS):
    # A text with no words would divide by zero; give it all-zero
    # frequencies instead of aborting the whole run.
    scale = 100 / n_words if n_words else 0.0
    # Only existing keys are reassigned, so the dict's size never changes
    # while it is being iterated.
    for tag in freqs:
        freqs[tag] = freqs[tag] * scale

# Reshape for pandas: one column per POS tag, each holding that tag's
# frequency for every text in the same order as POS_FREQ.
D = {tag: [freqs[tag] for freqs in POS_FREQ] for tag in tag_types}

# Rows are labelled with the file names.
df = pd.DataFrame(D, index=fnames)
df.head()

# Pull the values out as an array to use as the feature matrix.
X = df.values

# k-nearest-neighbours: hold out 20% of the data once, then score a
# classifier on that same split for every k from 1 to 30.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

N = list(range(1, 31))  # values of k that were tried
S = []                  # test-set score for each k

for n_neighbors in N:
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    S.append(score)

# Plot the score against k.
plt.xlabel("number_of_k")
plt.ylabel("accuracy")
plt.plot(N, S)

[<matplotlib.lines.Line2D at 0x157707670>]

# Gaussian naive Bayes: repeat the 80/20 split 99 times, each time with a
# freshly drawn random_state, and record the test-set score of every run.
N = list(range(1, 100))  # run numbers
S = []                   # test-set score of each run

for _run in N:
    seed = np.random.randint(1000)
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=seed
    )
    nbc = GaussianNB().fit(X_train, y_train)
    S.append(nbc.score(X_test, y_test))

# Plot the score of each run.
plt.ylabel("accuracy")
plt.plot(N, S)

[<matplotlib.lines.Line2D at 0x157804940>]

	DT	JJ	NNP	IN	,	NN	VBZ	VBG	RB	.	...	PDT	WDT	HYPH	RBR	JJS
JPN002.txt	7.908163	10.204082	2.040816	11.479592	7.142857	18.877551	3.826531	1.785714	8.163265	6.377551	...	0.000000	0.000000	0.000000	0.000000	0.000000
JPN004.txt	3.819444	7.638889	0.694444	10.069444	4.861111	25.347222	3.125000	0.347222	2.777778	5.902778	...	0.000000	0.000000	0.000000	0.000000	0.000000
JPN006.txt	8.759124	9.671533	2.737226	14.598540	3.832117	11.861314	2.007299	2.372263	3.467153	4.744526	...	0.364964	0.182482	0.182482	0.364964	0.000000
JPN008.txt	3.313253	6.927711	1.204819	14.156627	1.506024	17.168675	3.614458	1.807229	3.915663	6.325301	...	0.000000	0.602410	0.000000	0.000000	0.000000
JPN010.txt	8.951407	7.928389	1.278772	12.531969	3.324808	16.112532	3.580563	0.767263	5.882353	4.603581	...	0.255754	0.767263	0.511509	0.000000	0.255754

第5回の練習問題の解答例