import os
# Return the current working directory. The relative paths used further down
# ("../DATA01/...", "./sample.txt") are resolved against this directory.
# As a bare expression the value is only shown in an interactive session.
os.getcwd()


# Read JPN002.txt from the "text01" directory inside "DATA01", which sits one
# level above your own directory, and show its contents.

# open() takes the file location as its first argument; the second argument
# "r" selects read mode. read() then loads the whole file content.
# The `with` statement closes the file when the block ends, even if read()
# raises, so no explicit close() is needed.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm that this matches the corpus files.
with open("../DATA01/text01/JPN002.txt", "r") as f:
    text = f.read()

# Check that the file content has been stored in the variable `text`
# (the bare expression is displayed when run interactively).
text

"THE JAPANESE SCHOOL EDUCATION In Japan, education system is changing fast now. The content of education is reduced and students come to have free time more. Furthermore, 'total education time' is taken in all Japanese junior high school. I think this change is bad and Japanese government must change it to original form rapidly for the following reasons. Firstly, many young people this time cannot read or write basic words (Japanese 'kanji.') And, they cannot calculate, too. These things are need in daily life, even if they don't go to college or university. Originally, Japanese student got better score in reading and calculation than any other country's student few decades ago. For, reading, writing, and calculation were very important in Japanese society. Now, however, this good value in old Japan is being reduced. This is very large problem in Japan. Secondly, there is deep gap between the level of high school education and university education. Many students who don't learn the content of high school education cannot catch up with the class in universities. Furthermore, for example, I am medical student, but I don't learn biology in high school. And there are many students like me. In addition, the care of university to us is nearly nothing. So, the level of the study in technology, medicine and so is going down. This is very large problem in Japan, too. Thirdly, as the content of school education is reduced, at the same time, the curiosity of students seems reduced. The new idea and new device are coming from the curiosity, I think. So, the reduction of it means the down of possibility that the evolutional change in various field will happen. This is very large problem in Japan. In conclusion, there are problems like these in Japan, because of the reduction of basic education. Luckily, the Japanese government is planning to change the education system. I hope this change will be going back to old Japanese school education system. \n"


# Create a file named "sample.txt" in the current directory ("./") and write
# a line of text into it.

# open() in write mode ("w") creates the file (or truncates an existing one).
# The `with` statement closes the file when the block ends, even if write()
# raises, so the text is flushed to disk without an explicit close().
with open("./sample.txt", "w") as g:
    # write() takes the text to be written as its first argument.
    g.write("This is a sample text.")
# Check that the file has been created in your own directory.


# Use os.listdir to get the list of entry names in the target directory.
# NOTE: os.listdir returns the names in arbitrary order, so the order of F
# (and of the lists built from it further down) is not guaranteed to be the
# same on every machine.

import os

F = os.listdir("../DATA01/text01/")
print(F)

['JPN095.txt', 'JPN122.txt', 'JPN137.txt', 'JPN082.txt', 'JPN135.txt', 'JPN120.txt', 'JPN093.txt', 'JPN078.txt', 'JPN092.txt', 'JPN086.txt', 'JPN090.txt', 'JPN053.txt', 'JPN132.txt', 'JPN126.txt', 'JPN155.txt', 'JPN169.txt', 'JPN168.txt', 'JPN008.txt', 'JPN022.txt', 'JPN156.txt', 'JPN142.txt', 'JPN157.txt', 'JPN147.txt', 'JPN191.txt', 'JPN145.txt', 'JPN179.txt', 'JPN186.txt', 'JPN148.txt', 'JPN015.txt', 'JPN163.txt', 'JPN176.txt', 'JPN189.txt', 'JPN002.txt', 'JPN006.txt', 'JPN012.txt', 'JPN166.txt', 'JPN173.txt', 'JPN198.txt', 'JPN011.txt', 'JPN159.txt', 'JPN164.txt', 'JPN004.txt', 'JPN010.txt', 'JPN038.txt', 'JPN117.txt', 'JPN062.txt', 'JPN060.txt', 'JPN061.txt', 'JPN139.txt']


# Read several files with a for loop.

# List that collects the content of every file that is read.
T = []

# Each file name stored in F is bound to `file_name` in turn and appended to
# the directory path to form the first argument of open().
# The content that was read (`text`) is then appended to T.
# The `with` statement closes each file as soon as it has been read, even if
# read() raises.
for file_name in F:
    with open("../DATA01/text01/" + file_name, "r") as f:
        text = f.read()
    T.append(text)

# Take a look at the contents of T.


# Use the sent_tokenize and word_tokenize functions from the NLTK package.

# Import the functions from the package
from nltk import sent_tokenize,word_tokenize

# How sent_tokenize and word_tokenize behave

text = "Our meeting was not a coincidence. Nothing happens by accident."

# sent_tokenize splits a text into sentences and returns them as a list.
sent_tokenize(text)

['Our meeting was not a coincidence.', 'Nothing happens by accident.']


# word_tokenize splits a text into words and returns the list of words.
# Punctuation such as "." comes out as a separate token.
word_tokenize(text)

['Our',
 'meeting',
 'was',
 'not',
 'a',
 'coincidence',
 '.',
 'Nothing',
 'happens',
 'by',
 'accident',
 '.']


# Count the total number of sentences and the total number of words for each
# element of T read above.

# Sentence count per text: sent_tokenize splits a text into sentences, so the
# length of its result is the number of sentences.
SENT = [len(sent_tokenize(essay)) for essay in T]

# Word count per text: the length of the token list returned by
# word_tokenize.
WORD = [len(word_tokenize(essay)) for essay in T]


# Average number of words per sentence for each text: the word count divided
# by the sentence count, paired position by position.
WPS = [n_words / n_sents for n_words, n_sents in zip(WORD, SENT)]


# Compute the type-token ratio of each text.

# TTR holds one ratio per text: the number of distinct tokens (types) divided
# by the total number of tokens. Tokens are compared as exact strings, so
# differently-cased forms count as different types.
TTR = [len(set(tokens)) / len(tokens) for tokens in map(word_tokenize, T)]


# Read JPN002.txt from the "text02" directory and print it as-is.
# The `with` statement closes the file when the block ends, even if read()
# raises.
with open("../DATA01/text02/JPN002.txt", "r") as f:
    text = f.read()


print(text)

@Begin
@Participants:	JPN002
@PID:	PIDJP002
@Age:	20
@Sex:	M
@YearInSchool:	U1
@Major:	medicine
@StudyHistory:	8
@OtherLanguage:	Spanish=0.6;none=
@Qualification:	TOEIC=620(2005);none=;none=
@Abroad:	none=;none=
@Reading:	2
@Writing:	1
@Listening:	1
@Speaking:	1
@JapaneseEssay:	3
@EnglishEssay:	3
@Difficulty:	4
@Topic:	school education
@TWE:	3;3
@Proctor:	1
@Comments:	
@Coder:	2005-12-21 DataInputBy SAKAUE Tatsuya;
@Version:	2.2.2
*JPN002:	THE JAPANESE SCHOOL EDUCATION
%NTV:	The Japanese education system
%COM:	A more descriptive title would be better. How about something like "Problems will [worsen/continue] without a return to the old Japanese education system" or "Current Japanese education system doesn't measure up"?
%par:
*JPN002:	In Japan, education system is changing fast now.
%NTV:	In Japan, the education system is changing fast.
%COM:	
*JPN002:	The content of education is reduced and students come to have free time more.
%NTV:	The content of material taught has reduced and students have come to have more free time.
%COM:	
*JPN002:	Furthermore, 'total education time' is taken in all Japanese junior high school.
%NTV:	Furthermore, 'total education time' is now given in all junior high schools.
%COM:	"Total education time" is a direct translation of "sogotekina gakushu," but the term in English sounds vague. You may want to include examples for a better explanation.
*JPN002:	I think this change is bad and Japanese government must change it to original form rapidly for the following reasons.
%NTV:	I think these changes are for the worse and the Japanese government must return rapidly to the previous system for several reasons.
%COM:	
%par:
*JPN002:	Firstly, many young people this time cannot read or write basic words (Japanese 'kanji.')
%NTV:	First, many young people at this time cannot read or write basic words in Japanese 'kanji.'
%COM:	
*JPN002:	And, they cannot calculate, too.
%NTV:	They also have trouble doing [basic?] calculations.
%COM:	
*JPN002:	These things are need in daily life, even if they don't go to college or university.
%NTV:	These things will be needed in daily life even if they do not go to college or university.
%COM:	
*JPN002:	Originally, Japanese student got better score in reading and calculation than any other country's student few decades ago.
%NTV:	Japanese students were scoring better a few decades ago in reading and calculation than those in any other country.
%COM:	
*JPN002:	For, reading, writing, and calculation were very important in Japanese society.
%NTV:	These high scores were due to the importance Japanese society placed on reading, writing, and calculation.
%COM:	
*JPN002:	Now, however, this good value in old Japan is being reduced.
%NTV:	Now, however, this good value from old Japan is declining.
%COM:	
*JPN002:	This is very large problem in Japan.
%NTV:	This is a very large problem in Japan.
%COM:	
%par:
*JPN002:	Secondly, there is deep gap between the level of high school education and university education.
%NTV:	Second, there is a wide gap between the levels of high school and university education.
%COM:	
*JPN002:	Many students who don't learn the content of high school education cannot catch up with the class in universities.
%NTV:	Many students who do not learn the content of high school education cannot catch up with the classes in universities.
%COM:	
*JPN002:	Furthermore, for example, I am medical student, but I don't learn biology in high school.
%NTV:	For example, although I am a medical student, I did not learn biology in high school.
%COM:	
*JPN002:	And there are many students like me.
%NTV:	Many students are like me.
%COM:	
*JPN002:	In addition, the care of university to us is nearly nothing.
%NTV:	In addition, universities care nearly nothing for their students.
%COM:	
*JPN002:	So, the level of the study in technology, medicine and so is going down.
%NTV:	Therefore, the levels of study in such fields as technology and medicine are decreasing.
%COM:	I would recommend joining this sentence with the one before it: In addition, universities care nearly nothing for their students, so the levels of study...
*JPN002:	This is very large problem in Japan, too.
%NTV:	This is a very large problem in Japan.
%COM:	Unless you want to explain why this is a very large problem in Japan, this sentence does little to strengthen your argument.
%par:
*JPN002:	Thirdly, as the content of school education is reduced, at the same time, the curiosity of students seems reduced.
%NTV:	Third, as the material taught at school is reduced, students' curiosity seems to be declining.
%COM:	
*JPN002:	The new idea and new device are coming from the curiosity, I think.
%NTV:	New ideas and devices come from curiosity.
%COM:	
*JPN002:	So, the reduction of it means the down of possibility that the evolutional change in various field will happen.
%NTV:	Thus, reducing education materials means decreasing the possibility for evolution in various fields.
%COM:	
*JPN002:	This is very large problem in Japan.
%NTV:	This is also a very large problem in Japan.
%COM:	Again, I this sentence doesn't really add anything unless you explain why this is a large problem. You can't assume that the reader will automatically understand, for example, that Japan falling behind in technological innovation could cause economic problems, unemployment, and even social problems.
%par:
*JPN002:	In conclusion, there are problems like these in Japan, because of the reduction of basic education.
%NTV:	In conclusion, reducing basic education has caused serious problems in Japan.
%COM:	
*JPN002:	Luckily, the Japanese government is planning to change the education system.
%NTV:	Fortunately, the Japanese government is planning to change the education system.
%COM:	
*JPN002:	I hope this change will be going back to old Japanese school education system.
%NTV:	I hope that this change will be a return to the old Japanese education system.
%COM:	
@End


# First, read the files exactly as they are.

import os
file_names = os.listdir("../DATA01/text02/")

# Raw content of every file in "text02", in the order returned by os.listdir.
R_TEXT = []

for file in file_names:
    # `with` closes each file as soon as it has been read, so open handles do
    # not accumulate over the loop.
    with open("../DATA01/text02/" + file, "r") as f:
        text = f.read()
    R_TEXT.append(text)


# Split every essay that was read at line breaks (\n).
# Each element of S_TEXT is the list of lines of one file.
S_TEXT = [raw.split("\n") for raw in R_TEXT]


# Toy data: three lists of strings of varying length.
L = [["AAAA","A","AAAAAA","AA","AAA"],["BB","BBBBB","BBBBBBBB","BBB","BBBBBBB","B"],["C","CCCCC","CC","CCCCC"]]

# M collects the mean string length of each inner list of L.
M = []

for group in L:
    # Lengths of all strings in this inner list, then their arithmetic mean.
    lengths = [len(item) for item in group]
    M.append(sum(lengths) / len(lengths))

M

[3.2, 4.333333333333333, 3.25]


# Same computation with map: apply len to every string of an inner list,
# materialise the result as a list, and print its mean.
for group in L:
    X = list(map(len, group))
    print(sum(X) / len(X))

3.2
4.333333333333333
3.25


# Use a regular expression to extract only the text written by the learner.

import re

# Learner lines start with a tag such as "*JPN002:" followed by a tab.
# The pattern is compiled once and reused for every line.
learner_tag = re.compile(r"\*JPN\d\d\d:\t")

# One list per file, holding that file's learner sentences with the tag removed.
T_TEXT = []

for lines in S_TEXT:
    # Keep only the lines that begin with the learner tag and strip the tag.
    # A dedicated name is used here so that the list T of raw essays built
    # earlier in the file is not overwritten.
    learner_sents = [
        learner_tag.sub("", line) for line in lines if learner_tag.match(line)
    ]
    T_TEXT.append(learner_sents)

学習者言語の分析（基礎）1（第3回）

3.1 ディレクトリの位置

3.2 ファイルの読み書き

3.2.1 ファイルの読み込み

3.2.2 ファイルの書き込み

3.2.3 複数ファイルの読み込みとファイルの処理

3.3 テキストを処理する

3.4 コーパスと前処理

3.4.1 NICE 3.0

3.4.2 コーパスから必要なデータを取り出す

練習問題

問題1

問題2