Workflow stages
- Question or problem definition.
- Acquire training and testing data.
- Wrangle, prepare, cleanse the data.
- Analyze, identify patterns, and explore the data.
- Model, predict and solve the problem.
- Visualize, report, and present the problem solving steps and final solution.
- Supply or submit the results.
The following packages need to be installed beforehand; import them with the code below.
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io
import re
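These imports assume the standalone keras package with the pre-TF2 API (note the RMSprop(lr=...) call further below). If in doubt, a quick version check, purely illustrative, confirms what is actually installed:
# illustrative check only; the script assumes the standalone Keras API
import keras
print('keras version:', keras.__version__)
print('numpy version:', np.__version__)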
Preprocessing
# path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
path = 'barkers.txt'
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()
text = re.sub(r'<.*>', '', text)   # strip anything between angle brackets (markup tags)
text = re.sub(r'\n', ' ', text)    # replace newlines with spaces
text = re.sub(r' +', ' ', text)    # collapse repeated spaces
# Compute Character Indices
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))  # reverse lookup: decode one-hot class indices back to characters
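As a quick sanity check, char_indices and indices_char are inverse mappings; the snippet below is purely illustrative and just verifies the round trip on the objects defined above.
# illustrative sanity check, assuming chars / char_indices / indices_char exist as above
sample_char = chars[0]
assert indices_char[char_indices[sample_char]] == sample_char  # char -> index -> char round trip
print('first few mappings:', list(char_indices.items())[:5])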
## Vectorize Sentences
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])  # input: a 40-character window
    next_chars.append(text[i + maxlen])    # output: the single next character
print('nb sequences:', len(sentences))
print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)  # input: one-hot character windows
y = np.zeros((len(sentences), len(chars)), dtype=bool)          # output: one-hot next characters
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1    # one-hot encoding of each input character
    y[i, char_indices[next_chars[i]]] = 1  # one-hot encoding of the target character
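The vectorized tensors should end up with shape (number of sequences, maxlen, vocabulary size) for the input and (number of sequences, vocabulary size) for the target; an illustrative shape check:
# illustrative shape check
print('x shape:', x.shape)  # (len(sentences), maxlen, len(chars))
print('y shape:', y.shape)  # (len(sentences), len(chars))
assert x.shape == (len(sentences), maxlen, len(chars))
assert y.shape == (len(sentences), len(chars))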
Build the model
print('Build model...')
model = Sequential()
model.add(LSTM(1024, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001))
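The network is a single 1024-unit LSTM feeding a softmax over the character vocabulary. Printing a summary (the exact numbers depend on your corpus) makes the layer shapes and parameter counts explicit:
# illustrative: inspect layer output shapes and parameter counts
model.summary()  # LSTM parameter count = 4 * (1024 * (1024 + len(chars)) + 1024)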
Create the callback
def sample(preds, temperature=1.0):
    # Helper function to sample an index from a probability array.
    # Reweighting the probabilities helps avoid loops, e.g. the model
    # endlessly repeating short stock phrases such as "yes" / "no".
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)
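Lower temperatures sharpen the distribution (safer, more repetitive text) while higher temperatures flatten it (more varied, riskier text). A toy illustration of the reweighting, independent of the model:
# toy illustration of temperature scaling (not part of the training script)
toy_preds = np.array([0.6, 0.3, 0.1])
for temp in (0.2, 0.5, 1.0, 1.2):
    reweighted = np.exp(np.log(toy_preds) / temp)
    reweighted /= reweighted.sum()
    print(temp, np.round(reweighted, 3))  # low temperature concentrates mass on the most likely character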
def on_epoch_end(epoch, _):  # runs at the end of every epoch
    print('\n----- Generating text after Epoch: %d' % epoch)
    start_index = random.randint(0, len(text) - maxlen - 1)  # pick a random 40-character seed
    # for diversity in [0.2, 0.5, 1.0, 1.2]:
    #     print('----- diversity:', diversity)
    generated = ''
    sentence = text[start_index: start_index + maxlen]  # the seed the model continues from
    generated += sentence
    print('----- Generating with seed: "' + sentence + '"')
    sys.stdout.write(generated)
    for i in range(400):  # predict 400 characters
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, 0.5)
        next_char = indices_char[next_index]
        generated += next_char
        sentence = sentence[1:] + next_char
        sys.stdout.write(next_char)
        sys.stdout.flush()
    print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)  # register the function as a Keras callback
Train
model.fit(x, y, batch_size=128, epochs=60, callbacks=[print_callback])
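Once training finishes, you can generate from a seed of your own choosing rather than a random slice of the corpus. The sketch below is an illustrative convenience wrapper (the seed string, default length, and fallback for unseen characters are placeholders, not part of the original script); it reuses the trained model and the sample helper above.
# post-training generation sketch; seed text and defaults are illustrative placeholders
def generate(seed, length=200, temperature=0.5):
    sentence = seed.lower()[-maxlen:].rjust(maxlen)  # keep the last maxlen chars, pad with spaces
    generated = ''
    for _ in range(length):
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices.get(char, 0)] = 1.  # unseen characters fall back to index 0
        preds = model.predict(x_pred, verbose=0)[0]
        next_char = indices_char[sample(preds, temperature)]
        generated += next_char
        sentence = sentence[1:] + next_char
    return generated

print(generate('the dog barked at the '))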