import os
# Collect the file names in the essay directory, in sorted order
F = sorted(os.listdir("../DATA02/txt/"))
# Read the essay files.
# Each element of T is the full text of one file listed in F
# (one essay per learner), in the same order as F.

T = []

for f in F:
    # The context manager closes each file as soon as it has been read,
    # so no file handles are left open while the rest of the script runs.
    with open("../DATA02/txt/" + f, 'r') as t:
        T.append(t.read())

# Inspect the first essay (in a notebook cell this displays its text)
T[0]

'She enjoyed, ah Rena enjoyed taking violin lessons, but she was getting tired of it. So, her teacher said that she should, uh she should practice playing the violin every day. And her and her parents, ehhh the violin lesson is too expensive to continue if she was going to be lazy. So, he so she so she so she quit the violin lessons. But, several years later, she she went to she went to a violin concert and she was very impressed with she was very impressed by.\n'

# sent_tokenize、word_tokenizeのimport
from nltk import sent_tokenize, word_tokenize

# Count the sentences in each essay (one entry per element of T)
SENT = [len(sent_tokenize(essay)) for essay in T]

# Count the word tokens in each essay (one entry per element of T)
WORD = [len(word_tokenize(essay)) for essay in T]

# Count the characters in each essay (one entry per element of T).
# len() on the text counts every character, including spaces,
# punctuation and newlines. Each essay is already a single string,
# so it is measured directly instead of being rebuilt with "".join first.
LETTER = [len(essay) for essay in T]

# pandasのimport
import pandas as pd

# Build the data frame.
# Each "column name": list pair becomes one column.
# index supplies the row labels; here the file names stored in F are used.
data = pd.DataFrame(
    data={
        "num_of_sents": SENT,
        "num_of_words": WORD,
        "num_of_letter": LETTER,
    },
    index=F,
)

# Show the first rows of the data frame
data.head()

# Descriptive statistics for each column
data.describe()

# Histogram of the number of sentences per essay
data["num_of_sents"].plot.hist()

<AxesSubplot: ylabel='Frequency'>

# Box plot of the number of words per essay
data["num_of_words"].plot.box()

<AxesSubplot: >

# Scatter plot of sentence count (x) against character count (y)
data.plot.scatter(x="num_of_sents", y="num_of_letter")

<AxesSubplot: xlabel='num_of_sents', ylabel='num_of_letter'>

	num_of_sents	num_of_words	num_of_letter
S003.txt	5	102	465
S004.txt	1	33	207
S005.txt	5	76	370
S007.txt	7	93	492
S010.txt	8	139	655

	num_of_sents	num_of_words	num_of_letter
count	118.000000	118.000000	118.000000
mean	3.661017	95.355932	477.110169
std	2.935839	28.414371	134.807605
min	1.000000	22.000000	123.000000
25%	1.000000	78.250000	401.250000
50%	3.000000	95.500000	484.000000
75%	6.000000	114.000000	569.750000
max	11.000000	170.000000	825.000000