学習者言語の分析(基礎)第5回 1
In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")
from nltk import sent_tokenize,word_tokenize
import pandas as pd
import collections
import os
In [3]:
# Quick demonstration: run one sentence through the spaCy pipeline and
# print every token next to the value of its `tag_` attribute.
text = "If you put your hand in the water, you can touch the fish."
doc = nlp(text)
for word, tag in ((tok.text, tok.tag_) for tok in doc):
    print(word, tag)
If IN you PRP put VBP your PRP$ hand NN in IN the DT water NN , , you PRP can MD touch VB the DT fish NN . .
In [4]:
# Data loading: read the full text of every file in the NICE_NNS directory.
# File names are sorted so that T_NNS[i] is always the text of fname_NNS[i].
NNS_DIR = "../DATA02/NICE_NNS/"
fname_NNS = sorted(os.listdir(NNS_DIR))
T_NNS = []
for fname in fname_NNS:
    # `with` guarantees the handle is closed even if read() raises.
    with open(NNS_DIR + fname, "r") as f:
        T_NNS.append(f.read())
In [5]:
# Word count of each essay, measured as the number of NLTK word tokens.
WORDS = [len(word_tokenize(essay)) for essay in T_NNS]
In [6]:
# For every essay, the list of `tag_` values of its spaCy tokens, in token order.
POS_NNS = [[token.tag_ for token in nlp(essay)] for essay in T_NNS]
In [7]:
# Per-essay tag frequencies: one Counter (tag -> raw count) for each essay.
IND_POS_NNS = [collections.Counter(tags) for tags in POS_NNS]
In [8]:
# Inventory of every tag that occurs in any essay, in order of first appearance.
# dict.fromkeys drops repeated tags while preserving insertion order.
tag_types = list(
    dict.fromkeys(tag for counts in IND_POS_NNS for tag in counts)
)
In [9]:
# Give every essay an entry for every tag so that all essays share the same
# keys; a tag that an essay never uses gets an explicit count of 0.
# Each Counter is copied before filling, so the raw counts in IND_POS_NNS are
# not modified and later in-place changes to IND_POS_NNS2 cannot leak back
# into IND_POS_NNS.
IND_POS_NNS2 = []
for counts in IND_POS_NNS:
    filled = counts.copy()
    for tag in tag_types:
        filled.setdefault(tag, 0)
    IND_POS_NNS2.append(filled)
In [10]:
# Convert the raw tag counts of each essay into a rate per 100 words.
# The rates are rebuilt from the raw counts in IND_POS_NNS instead of being
# scaled in place, so running this cell more than once does not scale the
# values more than once.
# NOTE(review): the counts come from spaCy tokens while WORDS comes from
# NLTK's word_tokenize, so the rates of one essay need not sum to exactly 100.
IND_POS_NNS2 = []
for counts, n_words in zip(IND_POS_NNS, WORDS):
    # An essay with zero words has no defined rate: use NaN instead of
    # raising ZeroDivisionError.
    scale = 100 / n_words if n_words else float("nan")
    IND_POS_NNS2.append(
        # Counter returns 0 for a tag the essay never uses.
        collections.Counter({tag: counts[tag] * scale for tag in tag_types})
    )
In [11]:
# One empty column (a separate list) per tag; keys follow the order of tag_types.
D = {tag: [] for tag in tag_types}
In [12]:
# Fill each tag's column with one value per essay, in essay order.
# The columns are emptied first so that re-running this cell does not append
# a second copy of every value (the column length would then no longer match
# the number of essays).
for column in D.values():
    column.clear()
for rates in IND_POS_NNS2:
    for tag, rate in rates.items():
        D[tag].append(rate)
In [13]:
# Essays as rows (indexed by file name), tags as columns.
df = pd.DataFrame(data=D, index=fname_NNS)
# Preview the first five rows.
df.head(5)
Out[13]:
VBZ | NN | RB | VBP | . | PRP | IN | JJ | NNS | WP$ | ... | RBR | CD | : | RBS | PDT | SYM | NFP | LS | XX | $ | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
JPN001.txt | 6.910569 | 17.682927 | 7.317073 | 3.861789 | 6.300813 | 5.691057 | 11.382114 | 7.317073 | 4.878049 | 0.406504 | ... | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
JPN002.txt | 3.826531 | 18.877551 | 8.163265 | 3.061224 | 6.377551 | 2.806122 | 11.479592 | 10.204082 | 3.061224 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
JPN003.txt | 2.100840 | 13.445378 | 7.142857 | 3.781513 | 6.302521 | 7.563025 | 7.563025 | 10.924370 | 3.781513 | 0.420168 | ... | 0.840336 | 0.840336 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
JPN004.txt | 3.125000 | 25.347222 | 2.777778 | 3.472222 | 5.902778 | 5.902778 | 10.069444 | 7.638889 | 6.944444 | 0.000000 | ... | 0.000000 | 1.388889 | 1.388889 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
JPN005.txt | 1.179245 | 19.811321 | 5.188679 | 3.773585 | 4.716981 | 9.198113 | 12.028302 | 4.952830 | 3.301887 | 0.000000 | ... | 0.235849 | 0.235849 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 48 columns
5.2 依存関係の頻度
- 依存関係は、文中の単語間の意味的関係を示すと同時に、統語(文法)的関係も示しています。
- 依存関係の頻度も品詞の頻度同様、熟達度に関係していると考えることができます。例えば、
- 初学者のみが頻繁に使用する依存関係がある。
- 発達に伴い、頻度が大きく異なる依存関係とほとんど変わらない依存関係がある。
- 以下では、品詞の頻度と同様の手順で依存関係の頻度を抽出します。
In [13]:
# For every essay, the list of `dep_` values of its spaCy tokens, in token order.
DEP = [[token.dep_ for token in nlp(essay)] for essay in T_NNS]
In [14]:
# Per-essay dependency-label frequencies: one Counter (label -> raw count) per essay.
IND_DEP = [collections.Counter(labels) for labels in DEP]
In [15]:
# Inventory of every dependency label that occurs in any essay, in order of
# first appearance. dict.fromkeys drops repeats while preserving order.
dep_types = list(
    dict.fromkeys(label for counts in IND_DEP for label in counts)
)
In [16]:
# Give every essay an entry for every dependency label so that all essays
# share the same keys; a label that an essay never uses gets a count of 0.
# Each Counter is copied before filling, so the raw counts in IND_DEP are not
# modified and later in-place changes to IND_DEP2 cannot leak back into IND_DEP.
IND_DEP2 = []
for counts in IND_DEP:
    filled = counts.copy()
    for label in dep_types:
        filled.setdefault(label, 0)
    IND_DEP2.append(filled)
In [17]:
# Convert the raw dependency-label counts of each essay into a rate per 100 words.
# The rates are rebuilt from the raw counts in IND_DEP instead of being scaled
# in place, so running this cell more than once does not scale the values
# more than once.
# NOTE(review): the counts come from spaCy tokens while WORDS comes from
# NLTK's word_tokenize, so the rates of one essay need not sum to exactly 100.
IND_DEP2 = []
for counts, n_words in zip(IND_DEP, WORDS):
    # An essay with zero words has no defined rate: use NaN instead of
    # raising ZeroDivisionError.
    scale = 100 / n_words if n_words else float("nan")
    IND_DEP2.append(
        # Counter returns 0 for a label the essay never uses.
        collections.Counter({label: counts[label] * scale for label in dep_types})
    )
In [18]:
# One empty column (a separate list) per dependency label; keys follow the
# order of dep_types.
D2 = {label: [] for label in dep_types}
In [19]:
# Fill each label's column with one value per essay, in essay order.
# The columns are emptied first so that re-running this cell does not append
# a second copy of every value (the column length would then no longer match
# the number of essays).
for column in D2.values():
    column.clear()
for rates in IND_DEP2:
    for label, rate in rates.items():
        D2[label].append(rate)
In [20]:
# Essays as rows (indexed by file name), dependency labels as columns.
df2 = pd.DataFrame(data=D2, index=fname_NNS)
# Preview the first five rows.
df2.head(5)
Out[20]:
aux | compound | nsubj | advmod | ROOT | punct | mark | ccomp | acomp | prep | ... | prt | oprd | nummod | nmod | dative | predet | preconj | csubjpass | quantmod | parataxis | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
JPN001.txt | 4.268293 | 2.032520 | 10.569106 | 5.487805 | 6.300813 | 11.585366 | 3.861789 | 3.455285 | 2.845528 | 6.300813 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
JPN002.txt | 4.336735 | 2.551020 | 7.397959 | 6.632653 | 6.377551 | 14.795918 | 0.765306 | 0.765306 | 0.765306 | 10.459184 | ... | 0.255102 | 0.255102 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
JPN003.txt | 4.621849 | 0.840336 | 10.924370 | 7.563025 | 6.722689 | 11.344538 | 2.100840 | 1.260504 | 2.100840 | 5.042017 | ... | 0.000000 | 0.420168 | 0.420168 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
JPN004.txt | 5.902778 | 13.541667 | 9.027778 | 3.125000 | 6.250000 | 12.152778 | 2.430556 | 1.388889 | 1.388889 | 7.638889 | ... | 0.347222 | 0.000000 | 1.041667 | 0.347222 | 0.347222 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
JPN005.txt | 6.603774 | 4.952830 | 11.084906 | 5.660377 | 4.716981 | 6.367925 | 3.537736 | 2.830189 | 2.594340 | 8.018868 | ... | 0.000000 | 0.000000 | 0.000000 | 0.235849 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 44 columns
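依存関係の詳細についてはUniversal Dependency Relationsを参照。なお、spaCyの英語モデル(en_core_web_sm)が出力するラベル(acomp、prt、oprd など)はClearNLP方式に基づいており、Universal Dependenciesのラベルとは一部異なります。個々のラベルの意味は spacy.explain("acomp") のようにして確認できます。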