第6回の演習問題の解答例¶
../DATA01/text_train/
には学習者の作文が保存されています。- これらの作文に対する評価は
../DATA02/eva_train.csv
に保存されています。 - 評価はGood=1、Poor=0のように2値で付与されています。
- このデータを用いて自動採点システムを構築し、
../DATA01/text_test/
に保存されている作文を評価します。 - 評価に関して以下ような予想をし、これを前提に自動採点システムを構築します。
- 評価が高い作文には複雑な文が含まれている→文の平均語数が多い
- 評価が高い作文には難しい(頻度が低い)単語が含まれている→単語の平均文字数が多い
- 評価が高い作文にはさまざまな単語が含まれている→TTRの値が大きい
- また、特徴量の値が近いと同じ評価を受けると考えます。
In [1]:
import os
from nltk import word_tokenize,sent_tokenize
import pandas as pd
import numpy as np
In [2]:
# ファイルの読み込み
F_train = os.listdir("../DATA01/text_train/")
T_train = []
for i in F_train:
f = open("../DATA01/text_train/"+i,"r")
text = f.read()
f.close()
T_train.append(text)
In [3]:
WPS_train = []
for i in T_train:
s = len(sent_tokenize(i))
w = len(word_tokenize(i))
wps = w/s
WPS_train.append(wps)
In [4]:
LPW_train = []
for i in T_train:
w = len(word_tokenize(i))
l = len("".join(word_tokenize(i)))
lpw = l/w
LPW_train.append(lpw)
In [5]:
TTR_train = []
for i in T_train:
i = i.lower()
w = word_tokenize(i)
ttr = len(set(w))/len(w)
TTR_train.append(ttr)
In [6]:
data_train = pd.DataFrame({"WPS":WPS_train,"LPW":LPW_train,"TTR":TTR_train},index=F_train)
In [7]:
data_train_norm = pd.DataFrame(index=F_train)
WPS_min = data_train.min()[0]
WPS_max = data_train.max()[0]
LPW_min = data_train.min()[1]
LPW_max = data_train.max()[1]
data_train_norm["WPS_nmd"] = (data_train["WPS"] - WPS_min) / (WPS_max - WPS_min)
data_train_norm["LPW_nmd"] = (data_train["LPW"] - LPW_min) / (LPW_max - LPW_min)
data_train_norm["TTR"] = data_train["TTR"]
/var/folders/nz/_yfjsvk16fbcf6x8w1wcsf1w0000gn/T/ipykernel_10536/4216469562.py:3: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` WPS_min = data_train.min()[0] /var/folders/nz/_yfjsvk16fbcf6x8w1wcsf1w0000gn/T/ipykernel_10536/4216469562.py:4: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` WPS_max = data_train.max()[0] /var/folders/nz/_yfjsvk16fbcf6x8w1wcsf1w0000gn/T/ipykernel_10536/4216469562.py:5: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` LPW_min = data_train.min()[1] /var/folders/nz/_yfjsvk16fbcf6x8w1wcsf1w0000gn/T/ipykernel_10536/4216469562.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` LPW_max = data_train.max()[1]
In [8]:
eva_train = pd.read_csv("../DATA02/eva_train.csv",index_col=0)
data_train_norm["evaluation"] = eva_train["evaluation"]
In [9]:
# ファイルの読み込み
F_test = os.listdir("../DATA01/text_test/")
T_test = []
for i in F_test:
f = open("../DATA01/text_test/"+i,"r")
text = f.read()
f.close()
T_test.append(text)
WPS_test = []
for i in T_test:
s = len(sent_tokenize(i))
w = len(word_tokenize(i))
wps = w/s
WPS_test.append(wps)
LPW_test = []
for i in T_test:
w = len(word_tokenize(i))
l = len("".join(word_tokenize(i)))
lpw = l/w
LPW_test.append(lpw)
TTR_test = []
for i in T_test:
i = i.lower()
w = word_tokenize(i)
ttr = len(set(w))/len(w)
TTR_test.append(ttr)
data_test = pd.DataFrame({"WPS":WPS_test,"LPW":LPW_test,"TTR":TTR_test},index=F_test)
data_test_norm = pd.DataFrame(index=F_test)
WPS_min = data_test.min()[0]
WPS_max = data_test.max()[0]
LPW_min = data_test.min()[1]
LPW_max = data_test.max()[1]
data_test_norm["WPS_nmd"] = (data_test["WPS"] - WPS_min) / (WPS_max - WPS_min)
data_test_norm["LPW_nmd"] = (data_test["LPW"] - LPW_min) / (LPW_max - LPW_min)
data_test_norm["TTR"] = data_test["TTR"]
/var/folders/nz/_yfjsvk16fbcf6x8w1wcsf1w0000gn/T/ipykernel_10536/4263217944.py:40: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` WPS_min = data_test.min()[0] /var/folders/nz/_yfjsvk16fbcf6x8w1wcsf1w0000gn/T/ipykernel_10536/4263217944.py:41: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` WPS_max = data_test.max()[0] /var/folders/nz/_yfjsvk16fbcf6x8w1wcsf1w0000gn/T/ipykernel_10536/4263217944.py:42: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` LPW_min = data_test.min()[1] /var/folders/nz/_yfjsvk16fbcf6x8w1wcsf1w0000gn/T/ipykernel_10536/4263217944.py:43: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]` LPW_max = data_test.max()[1]
- ↑のセルまでは授業中に説明したのでここでは割愛します。
- 以下のコードを説明しないのはあまりに不親切だと思いましたので、行間に説明を挿入します。
In [10]:
# テストデータと訓練データの行名(JPN095.txtなど)を取得します。
N_test = data_test_norm.index
N_train = data_train_norm.index
# 処理した値を保存するリスト
L = []
# テストデータの行名をiに代入して個々に処理
for i in N_test:
# data_test_normの各行の特徴量をtest_dataに代入
test_data = data_test_norm.loc[i]
# 入れ子のfor文の処理を一時的に保存するリスト
tmp = []
# 訓練データの行名をjに代入して個々に処理
for j in N_train:
#data_train_normの各行の特徴量をtrain_dataに代入
train_data = data_train_norm.loc[j][:3]
# i(テストデータ)とj(訓練データ)とのユークリッド距離を計算
tmp.append(np.linalg.norm(test_data.values - train_data.values))
# ひとつのi(テストデータ)に対してすべてのj(訓練データ)との距離を計算し終えたら結果をLに保存
L.append(tmp)
In [11]:
# ユークリッド距離を比較するためのDataFrame
# 行(index)が訓練データ、列(column)がテストデータ)
# JPN137.txt行、JPN095.txt列はJPN137.txtとJPN095.txtのユークリッド距離
data_compare = pd.DataFrame(index=F_train)
for i,j in zip(F_test,L):
data_compare[i] = j
# evaluationを追加
data_compare["evaluation"] = data_train_norm["evaluation"]
In [12]:
data_compare.head()
Out[12]:
JPN095.txt | JPN122.txt | JPN082.txt | JPN120.txt | JPN093.txt | JPN092.txt | JPN086.txt | JPN090.txt | JPN126.txt | JPN117.txt | evaluation | |
---|---|---|---|---|---|---|---|---|---|---|---|
JPN137.txt | 0.733757 | 0.555997 | 0.776568 | 0.169252 | 0.161726 | 0.590981 | 0.554092 | 0.356668 | 0.225830 | 0.306765 | 0 |
JPN135.txt | 0.202230 | 1.008077 | 0.964199 | 0.471182 | 0.461867 | 0.521884 | 0.119873 | 0.594353 | 0.522778 | 0.409769 | 0 |
JPN078.txt | 0.286770 | 0.900292 | 0.803053 | 0.396077 | 0.478505 | 0.352413 | 0.111903 | 0.464618 | 0.448077 | 0.469036 | 1 |
JPN053.txt | 0.360625 | 0.822105 | 0.804544 | 0.285335 | 0.336614 | 0.401562 | 0.172244 | 0.411259 | 0.354920 | 0.345575 | 1 |
JPN132.txt | 0.328395 | 0.999020 | 1.036634 | 0.459461 | 0.368285 | 0.630646 | 0.261303 | 0.629197 | 0.503725 | 0.280036 | 1 |
In [13]:
# 自動採点システム
# F_testにはテストデータのファイル名が入っています。
# テストデータのファイル名の列を昇順でソートします。
# そのテストデータに距離が近い上位7つの訓練データの"evaluation"の合計が3より大きい、つまり、
# そのテストデータに近いところにある訓練データについている評価の半分以上が1のとき、テストデータの
# 評価は1、それ以外は0という仕組み。
F_test.sort()
E = []
for i in F_test:
e = sum(data_compare.sort_values(by=i).head(7)["evaluation"].values)
if e > 3:
f = 1
else:
f = 0
E.append(f)
In [14]:
# 自動採点の結果
E
Out[14]:
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
In [15]:
# これらのテストデータには本当は評価が付与されてました。
eva_test = pd.read_csv("../DATA02/eva_test.csv",index_col=0)
eva_test
Out[15]:
evaluation | |
---|---|
JPN082.txt | 1 |
JPN086.txt | 0 |
JPN090.txt | 1 |
JPN092.txt | 0 |
JPN093.txt | 0 |
JPN095.txt | 0 |
JPN117.txt | 0 |
JPN120.txt | 0 |
JPN122.txt | 1 |
JPN126.txt | 0 |