import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")

# One example sentence per dependency label of interest; each sentence is
# parsed and its dependency tree is rendered with displaCy.
example_sentences = [
    "You can put the package wherever you like.",  # advcl example
    "My teacher, who came here in 1986, likes bananas.",  # relcl example
    "I know that he stole the chicken.",  # ccomp example
    "What she said is interesting.",  # csubj example
    "The girl dancing over there is my sister.",  # acl example
]

for sentence in example_sentences:
    doc = nlp(sentence)
    displacy.render(doc)

# Count how many tokens in the sample text carry one of the clause labels.
text = (
    "You can put the package wherever you like. "
    "My teacher, who came here in 1986, likes bananas. "
    "I know that he stole the chicken. "
    "She asked George to respond her. "
    "What she said is interesting. "
    "The girl dancing over there is my sister."
)

# Only these four dependency labels are counted in this example.
CLAUSE = ["advcl", "relcl", "ccomp", "csubj"]

doc = nlp(text)

# Walk every token of every sentence and tally those whose label is in CLAUSE.
n = sum(token.dep_ in CLAUSE for sent in doc.sents for token in sent)

n

4

# Noun-phrase example: render the dependency parse of one sentence.
np_sentence = "A tall man put his blue hat on an old table."
doc = nlp(np_sentence)
displacy.render(doc)

# Count noun phrases by counting the tokens whose dependency label is
# listed in NOUNPHRASE.
NOUNPHRASE = ["nsubj", "dobj", "pobj"]

doc = nlp(np_sentence)

n = sum(1 for token in doc if token.dep_ in NOUNPHRASE)

n

3

# Length of every noun chunk in `doc`, measured in whitespace-separated words.
N = [len(chunk.text.split()) for chunk in doc.noun_chunks]

N

[3, 3, 3]

import os
def _read_texts(directory, filenames):
    """Return the contents of every file in *filenames* found in *directory*.

    Args:
        directory: Path of the directory that holds the files.
        filenames: Names of the files to read, relative to *directory*.

    Returns:
        A list with one string per file, in the same order as *filenames*.

    Raises:
        OSError: If a file cannot be opened or read.
    """
    texts = []
    for filename in filenames:
        # `with` closes the handle even when read() raises, which the
        # manual open()/close() pair did not guarantee.
        with open(os.path.join(directory, filename), "r") as handle:
            texts.append(handle.read())
    return texts


# Load the data: one list of file names and one list of texts per directory.
fname_NS = os.listdir("../DATA02/NICE_NS/")
T_NS = _read_texts("../DATA02/NICE_NS/", fname_NS)

fname_NNS = os.listdir("../DATA02/NICE_NNS/")
T_NNS = _read_texts("../DATA02/NICE_NNS/", fname_NNS)

# Build the gold labels: 0 for every text in T_NNS, then 1 for every text in T_NS.
Y = [0 for _ in T_NNS] + [1 for _ in T_NS]

# Frequency of the target clause types per sentence
from nltk import sent_tokenize

CLAUSE = ["advcl", "relcl", "ccomp", "xcomp", "acl", "csubj"]


def _clauses_per_sentence(text):
    """Return the number of CLAUSE-labelled tokens per sentence in *text*.

    The text is split into sentences with ``sent_tokenize``; each sentence
    is parsed on its own with ``nlp`` and every token whose dependency
    label is in ``CLAUSE`` is counted.

    Args:
        text: The raw text of one document.

    Returns:
        The clause count divided by the number of sentences, or 0.0 when
        the text contains no sentences (this avoids a ZeroDivisionError on
        an empty document).
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return 0.0
    clause_count = 0
    for sentence in sentences:
        parsed = nlp(sentence)
        clause_count += sum(1 for token in parsed if token.dep_ in CLAUSE)
    return clause_count / len(sentences)


# One value per text, in the same order as T_NNS / T_NS.
CpS_NNS = [_clauses_per_sentence(text) for text in T_NNS]

CpS_NS = [_clauses_per_sentence(text) for text in T_NS]

def _mean_noun_chunk_length(text):
    """Return the mean noun-chunk length, in words, over all of *text*.

    The text is split into sentences with ``sent_tokenize``; each sentence
    is parsed on its own with ``nlp`` and the length of every noun chunk is
    taken as its number of whitespace-separated words.

    Args:
        text: The raw text of one document.

    Returns:
        The average noun-chunk length, or 0.0 when the text yields no noun
        chunks (this avoids a ZeroDivisionError on such a document).
    """
    lengths = []
    for sentence in sent_tokenize(text):
        parsed = nlp(sentence)
        lengths.extend(len(chunk.text.split()) for chunk in parsed.noun_chunks)
    if not lengths:
        return 0.0
    return sum(lengths) / len(lengths)


# One value per text, in the same order as T_NNS / T_NS.
NL_NNS = [_mean_noun_chunk_length(text) for text in T_NNS]

NL_NS = [_mean_noun_chunk_length(text) for text in T_NS]

# Pair the two features of every text as [clauses per sentence, noun-chunk length].
X_NNS = [[cps, nl] for cps, nl in zip(CpS_NNS, NL_NNS)]

X_NS = [[cps, nl] for cps, nl in zip(CpS_NS, NL_NS)]

# NNS rows first, then NS rows, matching the order of the labels in Y.
X = X_NNS + X_NS

# パッケージのimport
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
%matplotlib inline

# Hold out 20% of the samples. No random_state is passed, so the split (and
# therefore the curve below) can differ from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Fit a k-nearest-neighbours classifier for every k from 1 to 30 and record
# its accuracy on the held-out samples.
N = list(range(1, 31))
S = []
for k in N:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    S.append(knn.score(X_test, y_test))

# Plot held-out accuracy against k.
plt.xlabel("number_of_k")
plt.ylabel("accuracy")
plt.plot(N, S)

[<matplotlib.lines.Line2D at 0x3024368e0>]

学習者言語の分析（基礎）2（第5回）

4.2 依存関係に関する特徴量を抽出する

4.2.1 節の抽出

4.2.2 名詞句の抽出

4.3 依存関係に関する特徴量を用いた自動採点

練習問題