# 使用するライブラリのimport
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Generate the data: four independent 100-sample normal draws from a fixed seed.
# (X1, Y1) are the coordinates plotted under label "F" and (X2, Y2) those under
# label "P". Each pair below is (mean, standard deviation); the draws are made
# in the order X1, Y1, X2, Y2, so the seeded sequence is reproducible.
np.random.seed(100)
X1, Y1, X2, Y2 = (
    np.random.normal(mean, sd, 100)
    for mean, sd in ((7, 5), (15, 5), (15, 5), (4, 4))
)

# Draw both classes on the same axes, one legend entry per class.
for xs, ys, tag in ((X1, Y1, "F"), (X2, Y2, "P")):
    plt.scatter(xs, ys, label=tag)
plt.legend()

<matplotlib.legend.Legend at 0x151576a30>

# Same two-class scatter, plus one extra point at (7, 2) rendered with the
# mathtext marker "NEW". The extra point has no label, so it does not appear
# in the legend.
for xs, ys, tag in ((X1, Y1, "F"), (X2, Y2, "P")):
    plt.scatter(xs, ys, label=tag)
plt.legend()
plt.scatter(7, 2, marker="$NEW$", s=500)

<matplotlib.collections.PathCollection at 0x1516fb250>

# Mean x and y coordinate of each class; together they give the class centroids
# (x1_ave, y1_ave) for "F" and (x2_ave, y2_ave) for "P".
x1_ave, y1_ave, x2_ave, y2_ave = map(np.average, (X1, Y1, X2, Y2))

# Two-class scatter with each class centroid overlaid as a large white star
# and the extra point at (7, 2) shown with the "NEW" marker.
for xs, ys, tag in ((X1, Y1, "F"), (X2, Y2, "P")):
    plt.scatter(xs, ys, label=tag)
for cx, cy in ((x1_ave, y1_ave), (x2_ave, y2_ave)):
    plt.scatter(cx, cy, marker="*", s=500, color="white", edgecolors="k")
plt.scatter(7, 2, marker="$NEW$", s=500)
plt.legend()

<matplotlib.legend.Legend at 0x15200fac0>

# Euclidean distance from the new point (7, 2) to each class centroid:
# F_distance to the "F" centroid, P_distance to the "P" centroid.
F_distance, P_distance = (
    np.sqrt((7 - cx) ** 2 + (2 - cy) ** 2)
    for cx, cy in ((x1_ave, y1_ave), (x2_ave, y2_ave))
)
print(F_distance, P_distance)

12.659487545459246 8.8583183121659

# Combine feature x and feature y into one 2-element array per sample.
# F holds the (X1, Y1) points and P holds the (X2, Y2) points, in input order.
F = [np.array([x_val, y_val]) for x_val, y_val in zip(X1, Y1)]
P = [np.array([x_val, y_val]) for x_val, y_val in zip(X2, Y2)]

# Number of leading samples of each class held out as test data; the rest of
# each class is used for training.
N_TEST = 20

# Split class F.
test_F, train_F = F[:N_TEST], F[N_TEST:]

# Split class P.
test_P, train_P = P[:N_TEST], P[N_TEST:]

# Build the test set and its ground-truth labels (0 = class F, 1 = class P).
# The label counts come from the actual split sizes so that the labels always
# line up with N, whatever N_TEST is set to.
N = test_F + test_P
L = [0] * len(test_F) + [1] * len(test_P)

# Compute the centroid of each class from its training points.
# Each training set is a list of 2-element [x, y] arrays; averaging along
# axis 0 yields one [mean_x, mean_y] array per class. np.mean divides by the
# actual number of training points, so the centroids stay correct if the
# train/test split size changes.
F_ave = np.mean(train_F, axis=0)
P_ave = np.mean(train_P, axis=0)

plt.figure()

# Plot each group with a single scatter call rather than one call per point.
# Layers are drawn in list order: training F (blue), training P (red), the two
# class centroids (yellow stars), and finally the test points as hollow
# black-edged circles.
layers = [
    (train_F, {"color": "b"}),
    (train_P, {"color": "r"}),
    ([F_ave, P_ave], {"marker": "*", "s": 500, "color": "y", "edgecolors": "k"}),
    (N, {"color": "None", "edgecolors": "k"}),
]
for points, style in layers:
    # np.transpose turns a sequence of [x, y] pairs into (all_x, all_y).
    plt.scatter(*np.transpose(points), **style)

# Get the automatic scoring system's verdict for every test point.
# A point is predicted as 1 (class P) when it is strictly closer to the P
# centroid than to the F centroid; otherwise, ties included, it is predicted
# as 0 (class F).
y_pred = [
    1 if np.linalg.norm(P_ave - point) < np.linalg.norm(F_ave - point) else 0
    for point in N
]

print(y_pred)

[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1]

print(L)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# Tally the confusion-matrix cells from (true label, predicted label) pairs:
#   "a": true 1, predicted 1    "b": true 1, predicted 0
#   "c": true 0, predicted 1    "d": every other combination
D = {"a": 0, "b": 0, "c": 0, "d": 0}
cell_for = {(1, 1): "a", (1, 0): "b", (0, 1): "c"}

for actual, predicted in zip(L, y_pred):
    D[cell_for.get((actual, predicted), "d")] += 1

# Lay the counts out as a 2x2 table: row "true" holds the samples whose real
# label is 1 (a, b), row "false" those whose real label is 0 (c, d); column
# "positive" is predicted 1 and column "negative" is predicted 0.
df = pd.DataFrame(
    [[D["a"], D["b"]], [D["c"], D["d"]]],
    index=["true", "false"],
    columns=["positive", "negative"],
)
df

# Accuracy: correctly classified samples (cells a and d) over all samples.
sum(D[cell] for cell in ("a", "d")) / sum(D.values())

0.875

# False-negative rate (the "strictness" of the automatic scoring): the share
# of samples whose true label is 1 that were predicted as 0.
D["b"] / sum(D[cell] for cell in ("a", "b"))

0.15

# False-positive rate (the "leniency" of the automatic scoring): the share
# of samples whose true label is 0 that were predicted as 1.
D["c"] / sum(D[cell] for cell in ("c", "d"))

0.1

import random

# Shuffle without modifying the input: sampling len(L) items without
# replacement returns a new list holding every element in random order.
# NOTE: this rebinds L, which also names the test-label list used by the
# evaluation cells; re-run the cell that builds the labels before re-evaluating.
L = list(range(1, 6))
L_shuffle = random.sample(L, k=len(L))
L_shuffle

[4, 5, 2, 1, 3]

本当\自動	合格	不合格
合格	a	b
不合格	c	d

学習者言語の分析（基礎）2 第1回

1.3 簡単な自動採点システム

1.4 自動採点システムの予測精度の評価

1.4.1 交差検証法

1.4.2 予測精度の評価指標

練習問題

	positive	negative
true	17	3
false	2	18