# 今回使用するパッケージ
import pandas as pd

# 以下は可視化（グラフの描画）に使用するmatplotlibというパッケージをimportしてます。
import matplotlib.pyplot as plt

# 以下はグラフをjupyter notebook上に表示するためのマジックコマンドと呼ばれるコマンドです。
%matplotlib inline

# Load the data; index_col=0 uses the first CSV column as the row index.
# Results of roughly 200 examinees on English Reading (R), Listening (L) and Writing (W) tests.
data = pd.read_csv("../DATA01/IEDA05_01.csv",index_col=0)
data.head()

# Choose the plot type with kind="hist" and the columns to draw with y (here "R" and "L").
# bins sets the number of bins and alpha the opacity, so the overlapping histograms stay visible.
data.plot(y=["R","L"],kind="hist",alpha=0.5,bins=25)

<AxesSubplot: ylabel='Frequency'>

# When no column names are given, every column is drawn.
data.plot(kind="hist")

<AxesSubplot: ylabel='Frequency'>

# The plot above is a little hard to read, so
# set the number of bins with bins and the transparency of the bars with alpha.
data.plot(y=["R","L"],kind="hist",bins=22,alpha=0.5)

<AxesSubplot: ylabel='Frequency'>

# x selects the column for the horizontal axis and y the column for the vertical axis.
data.plot(x="L",y="R",kind="scatter")

<AxesSubplot: xlabel='L', ylabel='R'>

# Passing a third column with c colors each dot according to that column's value (here "W").
data.plot(x="L",y="R",kind="scatter",c="W")

<AxesSubplot: xlabel='L', ylabel='R'>

# scatter_matrix draws a grid of plots, one for every pair of columns in data
# (a 3x3 grid for R, L and W); alpha sets the opacity of the dots.
from pandas.plotting import scatter_matrix
scatter_matrix(data,alpha=0.5)

array([[<AxesSubplot: xlabel='R', ylabel='R'>,
        <AxesSubplot: xlabel='L', ylabel='R'>,
        <AxesSubplot: xlabel='W', ylabel='R'>],
       [<AxesSubplot: xlabel='R', ylabel='L'>,
        <AxesSubplot: xlabel='L', ylabel='L'>,
        <AxesSubplot: xlabel='W', ylabel='L'>],
       [<AxesSubplot: xlabel='R', ylabel='W'>,
        <AxesSubplot: xlabel='L', ylabel='W'>,
        <AxesSubplot: xlabel='W', ylabel='W'>]], dtype=object)

# Data used in this section; index_col=0 uses the first CSV column as the row index.
data2 = pd.read_csv("../DATA01/IEDA05_02.csv",index_col=0)
data2.head()

# Split the data by the "Class" column and compute the mean of each class.
# Passing a column name to groupby() is what splits the data into groups.
data2.groupby("Class").mean()

# Passing a column name to boxplot()'s column argument shows the distributions
# of Class A and Class B for that column as box plots.
data2.groupby("Class").boxplot(column="item2")

A         AxesSubplot(0.1,0.15;0.363636x0.75)
B    AxesSubplot(0.536364,0.15;0.363636x0.75)
dtype: object

# Specific columns can be selected as well. Pass the column names as a list
# (note the double brackets): selecting with a bare tuple of keys, as in
# gb["item2","item3"], is deprecated and raises an error in pandas 2.0+.
data2.groupby("Class")[["item2","item3"]].mean()

/var/folders/rt/wv_48tq925z8xx5ffnwfq01r0000gn/T/ipykernel_44782/732549167.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  data2.groupby("Class")["item2","item3"].mean()

data2.head()

# Build a Series holding the total score of each row.
# A numeric "G_P" column is added to data2 later in this notebook; dropping it
# here (silently skipped when it does not exist yet) keeps the totals limited
# to the item scores even if this cell is re-run after that column exists.
s_total = data2.drop(columns=["G_P"],errors="ignore").sum(numeric_only=True,axis=1)
s_total

s01    37
s02    23
s03    26
s04    27
s05    23
s06    14
s07    30
s08    19
s09    13
s10    29
s11    20
s12    48
s13    30
s14    39
s15    48
s16    35
s17    28
s18    18
s19    24
s20    48
dtype: int64

# Build the list G: 3 when the total score is in the top 25%,
# 1 when it is in the bottom 25%, and 2 for everything in between.

# Quartile cut-offs of the total scores.
lower = s_total.quantile(.25)
upper = s_total.quantile(.75)

# The comparisons are strict, so a score exactly equal to a cut-off
# falls into the middle group (2).
G = [
    3 if total > upper else (1 if total < lower else 2)
    for total in s_total
]

G

[3, 2, 2, 2, 2, 1, 2, 1, 1, 2, 1, 3, 2, 3, 3, 2, 2, 1, 2, 3]

# Store G in data2 as the new column "G_P".
data2["G_P"] = G

# Mean of every numeric column per G_P group; numeric_only=True leaves out
# the non-numeric "Class" column.
data2.groupby("G_P").mean(numeric_only=True)

# Drawing the per-group means as a line chart gives the plot below.
# Compare the table above with the chart below.
data2.groupby("G_P").mean(numeric_only=True).plot(kind="line")

<AxesSubplot: xlabel='G_P'>

# Selecting columns with a list of names (note the double brackets) plots only
# those columns. Selecting with a bare tuple of keys, as in gb["item1","item2"],
# is deprecated and raises an error in pandas 2.0+.
data2.groupby("G_P")[["item1","item2"]].mean(numeric_only=True).plot(kind="line")

/var/folders/rt/wv_48tq925z8xx5ffnwfq01r0000gn/T/ipykernel_44782/3173011769.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.
  data2.groupby("G_P")["item1","item2"].mean(numeric_only=True).plot(kind="line")

<AxesSubplot: xlabel='G_P'>

# パッケージのimport
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Read the CSV; index_col=0 uses the first CSV column as the row index.
# NOTE: this rebinds `data`, which held the contents of IEDA05_01.csv earlier in this file.
data = pd.read_csv("../DATA01/IEDA05_03.csv",index_col=0)
data.head()

	R	L	W
S001	156	151	100
S002	166	145	89
S003	175	176	108
S004	235	193	110
S005	136	172	106

	item1	item2	item3	item4	item5	item6	item7	item8
Class
A	3.7	3.1	2.8	2.9	2.7	3.0	2.9	3.0
B	4.6	4.2	4.2	4.2	4.1	4.1	4.3	4.1

	item2	item3
Class
A	3.1	2.8
B	4.2	4.2

	item1	item2	item3	item4	item5	item6	item7	item8
G_P
1	2.8	2.2	1.8	1.8	1.4	1.8	2.4	2.6
2	4.1	3.5	3.4	3.4	3.4	3.5	3.1	3.1
3	5.6	5.4	5.4	5.6	5.4	5.4	5.8	5.4

	Q1	Q2	Q3	Q4	Q5	Q6	Q7	Q8	Q9	Q10	...	Q51	Q52	Q53	Q54	Q55	Q56	Q57	Q59	Q60
S001	1	1	1	1	1	1	1	1	1	1	...	0	1	0	0	1	1	1	1	1
S002	1	1	1	0	1	1	1	1	1	1	...	0	1	1	0	1	0	0	0	0
S003	0	1	1	1	1	1	0	1	1	1	...	1	1	0	0	1	1	1	0	0
S004	1	1	1	1	0	1	1	1	1	1	...	1	1	0	1	1	1	1	1	1
S005	1	0	1	1	1	1	1	1	1	1	...	0	1	0	1	0	0	0	0	0

教育データ分析入門1 （第5回）

5.1 Pandasを用いた可視化¶

5.1.1 準備¶

5.1.2 ヒストグラム¶

5.1.3 散布図¶

5.1.4 散布図行列¶

5.2 Groupbyを用いたグルーピング¶

5.2.1 準備¶

5.2.2 Groupby¶

5.3 Good-Poor分析¶

練習問題¶

	item1	item2	item3	item4	item5	item6	item7	item8	Class
s01	5	4	4	5	4	4	6	5	A
s02	4	3	1	3	3	3	2	4	A
s03	4	4	3	3	3	3	3	3	A
s04	5	3	3	3	3	4	3	3	A
s05	3	2	3	3	3	3	3	3	A

	item1	item2	item3	item4	item5	item6	item7	item8	Class
s01	5	4	4	5	4	4	6	5	A
s02	4	3	1	3	3	3	2	4	A
s03	4	4	3	3	3	3	3	3	A
s04	5	3	3	3	3	4	3	3	A
s05	3	2	3	3	3	3	3	3	A

	item1	item2	item3	item4	item5	item6	item7	item8
G_P
1	2.8	2.2	1.8	1.8	1.4	1.8	2.4	2.6
2	4.1	3.5	3.4	3.4	3.4	3.5	3.1	3.1
3	5.6	5.4	5.4	5.6	5.4	5.4	5.8	5.4

	item1	item2	item3	item4	item5	item6	item7	item8	Class
s01	5	4	4	5	4	4	6	5	A
s02	4	3	1	3	3	3	2	4	A
s03	4	4	3	3	3	3	3	3	A
s04	5	3	3	3	3	4	3	3	A
s05	3	2	3	3	3	3	3	3	A

	item1	item2	item3	item4	item5	item6	item7	item8	Class
s01	5	4	4	5	4	4	6	5	A
s02	4	3	1	3	3	3	2	4	A
s03	4	4	3	3	3	3	3	3	A
s04	5	3	3	3	3	4	3	3	A
s05	3	2	3	3	3	3	3	3	A

	item1	item2	item3	item4	item5	item6	item7	item8
G_P
1	2.8	2.2	1.8	1.8	1.4	1.8	2.4	2.6
2	4.1	3.5	3.4	3.4	3.4	3.5	3.1	3.1
3	5.6	5.4	5.4	5.6	5.4	5.4	5.8	5.4

	item1	item2	item3	item4	item5	item6	item7	item8	Class
s01	5	4	4	5	4	4	6	5	A
s02	4	3	1	3	3	3	2	4	A
s03	4	4	3	3	3	3	3	3	A
s04	5	3	3	3	3	4	3	3	A
s05	3	2	3	3	3	3	3	3	A

	item1	item2	item3	item4	item5	item6	item7	item8	Class
s01	5	4	4	5	4	4	6	5	A
s02	4	3	1	3	3	3	2	4	A
s03	4	4	3	3	3	3	3	3	A
s04	5	3	3	3	3	4	3	3	A
s05	3	2	3	3	3	3	3	3	A

	item1	item2	item3	item4	item5	item6	item7	item8
G_P
1	2.8	2.2	1.8	1.8	1.4	1.8	2.4	2.6
2	4.1	3.5	3.4	3.4	3.4	3.5	3.1	3.1
3	5.6	5.4	5.4	5.6	5.4	5.4	5.8	5.4