import os
# Collect the names of the essay files, sorted by name.
F = sorted(os.listdir("../DATA01/text01/"))
# Read the essay files.
# Each element of T is the full text of one file, i.e. the essay written by
# one learner, in the same order as the file names in F.

T = []

for f in F:
    # The with-block closes each file as soon as it has been read;
    # a bare open() here would leave every handle open.
    with open("../DATA01/text01/"+f,'r') as t:
        text = t.read()
    T.append(text)

# sent_tokenize、word_tokenizeのimport
from nltk import sent_tokenize, word_tokenize

# Count the sentences in each essay (one entry per element of T).
SENT = [len(sent_tokenize(essay)) for essay in T]

# Count the word tokens in each essay (one entry per element of T).
WORD = [len(word_tokenize(essay)) for essay in T]

# Count the characters in each essay (one entry per element of T).
# len() is applied to the text directly: joining the characters of a string
# with "" only rebuilds the same string, so the counts are the same.
# Every character of the text is counted, whitespace and punctuation included.
LETTER = [len(essay) for essay in T]

# pandasのimport
import pandas as pd

# Build the DataFrame.
# Each keyword below is a column name paired with the list holding its values.
# index gives the row labels; the file names stored in F are used here.
data = pd.DataFrame(
    data=dict(num_of_sents=SENT, num_of_words=WORD, num_of_letter=LETTER),
    index=F,
)

# Inspect the contents of the DataFrame
# The first 5 rows are displayed
data.head()

# Accessing a column (by column name)
data["num_of_letter"]

JPN002.txt    1969
JPN004.txt    1555
JPN006.txt    3042
JPN008.txt    1747
JPN010.txt    1987
JPN011.txt    2331
JPN012.txt    1160
JPN015.txt    1990
JPN022.txt    1466
JPN038.txt    1415
JPN053.txt    2328
JPN060.txt    3080
JPN061.txt    1252
JPN062.txt    1852
JPN078.txt    4893
JPN082.txt    2891
JPN086.txt    1295
JPN090.txt    2249
JPN092.txt    2306
JPN093.txt    1417
JPN095.txt    1790
JPN117.txt    2000
JPN120.txt    1357
JPN122.txt    2757
JPN126.txt    1958
JPN132.txt    2124
JPN135.txt    2115
JPN137.txt    1299
JPN139.txt    1816
JPN142.txt    2789
JPN145.txt    1569
JPN147.txt    2499
JPN148.txt    1606
JPN155.txt    1988
JPN156.txt    3162
JPN157.txt    3975
JPN159.txt    2828
JPN163.txt    2574
JPN164.txt    2530
JPN166.txt    2277
JPN168.txt    2277
JPN169.txt    2214
JPN173.txt    2374
JPN176.txt    1359
JPN179.txt    1388
JPN186.txt    1560
JPN189.txt    2619
JPN191.txt    1794
JPN198.txt    1438
Name: num_of_letter, dtype: int64

# Accessing a row (by its index label)
data.loc["JPN002.txt"]

num_of_sents       25
num_of_words      392
num_of_letter    1969
Name: JPN002.txt, dtype: int64

# Mean of each column
data.mean()

num_of_sents       25.857143
num_of_words      419.877551
num_of_letter    2127.775510
dtype: float64

# Conditional selection: essays that contain more than 500 words
data[data["num_of_words"]>500]

# Sorting: fewest sentences first
# by="" names the column to sort on
data.sort_values(by="num_of_sents")

# Set the number of decimal places shown when values are displayed
pd.options.display.precision = 2

# Descriptive statistics
data.describe()

# count -> number of data points
# mean -> mean
# std -> standard deviation (a measure of how far the individual values spread from the mean)
# min -> minimum
# 25% -> first quartile (the value 25% of the way through the data sorted in ascending order)
# 50% -> median (the value 50% of the way through the data sorted in ascending order)
# 75% -> third quartile (the value 75% of the way through the data sorted in ascending order)
# max -> maximum

# Correlation coefficients
data.corr()

# Add a column.
# For each file, compute the average number of characters per word.
# Rule of thumb: a word with many characters tends to be hard,
# a word with few characters tends to be easy.

data["ave_of_word"] = data["num_of_letter"].div(data["num_of_words"])

# Confirm that the column was added
data.head()

# 可視化のために使用するmatplotlibをimport
import matplotlib.pyplot as plt
# jupyter notebook上で表示するための設定
%matplotlib inline

# Histogram of the average number of characters per word (ave_of_word)
data["ave_of_word"].plot(kind="hist")

# In a histogram,
# the bar height is the frequency (e.g. a number of people).
# It resembles a bar chart but is a different kind of plot.
# It shows the overall tendency of quantitative data.

<AxesSubplot: ylabel='Frequency'>

# A scatter plot
# visualizes the relationship between two variables.
# Here the x-axis is the total number of sentences and the y-axis is the average number of characters per word.
# Author's observation: knowing difficult words does not seem to mean a learner can write long sentences.
# NOTE(review): the x-axis is the sentence count, not sentence length - confirm the intended reading.
data.plot(kind="scatter",x="num_of_sents",y="ave_of_word")

<AxesSubplot: xlabel='num_of_sents', ylabel='ave_of_word'>

# Box plot of the word counts
# A box plot also shows the overall tendency of the data
data["num_of_words"].plot(kind="box")

<AxesSubplot: >

# Scatter-plot matrix
# A figure that shows scatter plots and histograms together

from pandas.plotting import scatter_matrix

scatter_matrix(data,alpha=0.5)

array([[<AxesSubplot: xlabel='num_of_sents', ylabel='num_of_sents'>,
        <AxesSubplot: xlabel='num_of_words', ylabel='num_of_sents'>,
        <AxesSubplot: xlabel='num_of_letter', ylabel='num_of_sents'>,
        <AxesSubplot: xlabel='ave_of_word', ylabel='num_of_sents'>],
       [<AxesSubplot: xlabel='num_of_sents', ylabel='num_of_words'>,
        <AxesSubplot: xlabel='num_of_words', ylabel='num_of_words'>,
        <AxesSubplot: xlabel='num_of_letter', ylabel='num_of_words'>,
        <AxesSubplot: xlabel='ave_of_word', ylabel='num_of_words'>],
       [<AxesSubplot: xlabel='num_of_sents', ylabel='num_of_letter'>,
        <AxesSubplot: xlabel='num_of_words', ylabel='num_of_letter'>,
        <AxesSubplot: xlabel='num_of_letter', ylabel='num_of_letter'>,
        <AxesSubplot: xlabel='ave_of_word', ylabel='num_of_letter'>],
       [<AxesSubplot: xlabel='num_of_sents', ylabel='ave_of_word'>,
        <AxesSubplot: xlabel='num_of_words', ylabel='ave_of_word'>,
        <AxesSubplot: xlabel='num_of_letter', ylabel='ave_of_word'>,
        <AxesSubplot: xlabel='ave_of_word', ylabel='ave_of_word'>]],
      dtype=object)

# Load data from a CSV file (its first column is used as the index)
eva = pd.read_csv("../DATA02/eva.csv",index_col=0)
eva.head()

import re
# Pick out the values needed for the essays listed in F.

# Remove ".txt" from each file name; the results are used below as row
# labels when looking up eva.
f_names = [re.sub(r"\.txt","",f) for f in F]

# One integer per essay, in the same order as F.
# eva.loc[name] is a single-element Series here, so its element is taken
# explicitly with .iloc[0]: calling int() on the Series itself is deprecated
# (pandas emits a FutureWarning) and will raise TypeError in the future.
EVA = [int(eva.loc[name].iloc[0]) for name in f_names]

/var/folders/00/8293wff96nvck2xsxdt2vc4w0000gn/T/ipykernel_42795/3818513998.py:13: FutureWarning: Calling int on a single element Series is deprecated and will raise a TypeError in the future. Use int(ser.iloc[0]) instead
  EVA.append(int(eva.loc[i]))

# Add the evaluation values to the existing DataFrame as a new column
data["eva"] = EVA
data.head()

# Mean of each column, grouped by evaluation value
data.groupby("eva").mean()

	num_of_sents	num_of_words	num_of_letter
count	49.00	49.00	49.00
mean	25.86	419.88	2127.78
std	10.03	143.22	727.56
min	10.00	230.00	1160.00
25%	19.00	304.00	1560.00
50%	24.00	406.00	1990.00
75%	27.00	477.00	2499.00
max	64.00	1013.00	4893.00

	num_of_sents	num_of_words	num_of_letter
num_of_sents	1.00	0.72	0.62
num_of_words	0.72	1.00	0.98
num_of_letter	0.62	0.98	1.00

用語	意味
最小値	データの中で最も小さい値
最大値	データの中で最も大きい値
第一四分位数(1Q)	データを小さい方から並べて小さい方から25%のところ
中央値	データを小さい方から並べて小さい方から50%のところ
第三四分位数 (3Q)	データを小さい方から並べて小さい方から75%のところ
四分位範囲 (IQR)	第三四分位数 - 第一四分位数
外れ値	「第一四分位数 - 1.5×四分位範囲」より小さい値、または「第三四分位数 + 1.5×四分位範囲」より大きい値

	num_of_sents	num_of_words	num_of_letter	ave_of_word
eva
2	28.71	353.86	1636.86	4.68
3	23.00	363.13	1854.48	5.13
4	29.14	497.50	2471.07	5.00
5	25.80	556.00	3111.00	5.59

学習者言語の分析（基礎）1（第4回）

4.1 Pandasとは

4.2 データの抽出

4.3 Pandasの基本操作

4.4 Pandasを用いたデータの可視化

4.5 データの読み込みと追加

4.6 カテゴリごとの統計量

練習問題

	num_of_sents	num_of_words	num_of_letter
JPN002.txt	25	392	1969
JPN004.txt	18	288	1555
JPN006.txt	26	548	3042
JPN008.txt	21	332	1747
JPN010.txt	18	391	1987

	num_of_sents	num_of_words	num_of_letter
JPN006.txt	26	548	3042
JPN060.txt	53	674	3080
JPN078.txt	64	1013	4893
JPN082.txt	24	584	2891
JPN092.txt	27	502	2306
JPN142.txt	33	573	2789
JPN156.txt	30	566	3162
JPN157.txt	33	702	3975
JPN159.txt	27	528	2828
JPN164.txt	29	536	2530

	num_of_sents	num_of_words	num_of_letter
JPN198.txt	10	249	1438
JPN179.txt	13	246	1388
JPN189.txt	15	465	2619
JPN186.txt	15	311	1560
JPN137.txt	16	240	1299
JPN011.txt	18	462	2331
JPN004.txt	18	288	1555
JPN010.txt	18	391	1987
JPN012.txt	19	230	1160
JPN145.txt	19	314	1569
JPN061.txt	19	264	1252
JPN139.txt	19	325	1816
JPN120.txt	19	272	1357
JPN148.txt	20	304	1606
JPN163.txt	20	448	2574
JPN008.txt	21	332	1747
JPN022.txt	21	302	1466
JPN176.txt	21	280	1359
JPN169.txt	22	452	2214
JPN062.txt	23	369	1852
JPN168.txt	23	458	2277
JPN155.txt	24	381	1988
JPN086.txt	24	292	1295
JPN038.txt	24	285	1415
JPN082.txt	24	584	2891
JPN173.txt	25	478	2374
JPN166.txt	25	460	2277
JPN002.txt	25	392	1969
JPN122.txt	25	499	2757
JPN015.txt	25	406	1990
JPN006.txt	26	548	3042
JPN090.txt	26	458	2249
JPN126.txt	27	388	1958
JPN191.txt	27	355	1794
JPN092.txt	27	502	2306
JPN159.txt	27	528	2828
JPN093.txt	27	286	1417
JPN147.txt	28	477	2499
JPN164.txt	29	536	2530
JPN156.txt	30	566	3162
JPN053.txt	32	472	2328
JPN157.txt	33	702	3975
JPN142.txt	33	573	2789
JPN135.txt	36	441	2115
JPN095.txt	41	430	1790
JPN132.txt	45	440	2124
JPN117.txt	46	406	2000
JPN060.txt	53	674	3080
JPN078.txt	64	1013	4893

	num_of_sents	num_of_words	num_of_letter	ave_of_word
JPN002.txt	25	392	1969	5.02
JPN004.txt	18	288	1555	5.40
JPN006.txt	26	548	3042	5.55
JPN008.txt	21	332	1747	5.26
JPN010.txt	18	391	1987	5.08

	evaluation
JPN002	3
JPN004	3
JPN006	5
JPN008	3
JPN010	3