Workflow stages
- Question or problem definition.
- Acquire training and testing data.
- Wrangle, prepare, cleanse the data.
- Analyze, identify patterns, and explore the data.
- Model, predict, and solve the problem.
- Visualize, report, and present the problem solving steps and final solution.
- Supply or submit the results.
The packages that need to be available by default are imported with the code below; os, json, and logging are also used in later cells, so they are imported here as well.
import json
import logging
import os

import tensorflow as tf
from google.colab import auth, drive
Accessing the GCS bucket
# Authenticate this Colab session so it can read from and write to GCS.
auth.authenticate_user()
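The drive module imported above is only needed if the raw corpus is stored in Google Drive. Mounting it is a single call; /content/drive is Colab's default mount point and is shown here only as an example.

# Mount Google Drive so files such as dataset.txt can be copied into the runtime.
drive.mount('/content/drive')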
Locating the TPU
# configure logging
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s : %(message)s')
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
log.handlers = [sh]
if 'COLAB_TPU_ADDR' in os.environ:
    log.info("Using TPU runtime")
    USE_TPU = True
    TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
    with tf.Session(TPU_ADDRESS) as session:
        log.info('TPU address is ' + TPU_ADDRESS)
        # Upload credentials to TPU.
        with open('/content/adc.json', 'r') as f:
            auth_info = json.load(f)
        tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
else:
    log.warning('Not connected to TPU runtime')
    USE_TPU = False
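Before relying on the TPU, it can be useful to confirm that the runtime really exposes eight cores, which is the assumption behind NUM_TPU_CORES = 8 further below. The following is a minimal sketch using the TF 1.x session API; it only lists devices and changes nothing.

if USE_TPU:
    with tf.Session(TPU_ADDRESS) as session:
        # Count only the TPU compute cores (device_type == 'TPU'), not TPU_SYSTEM.
        tpu_cores = [d.name for d in session.list_devices() if d.device_type == 'TPU']
        log.info("Found {} TPU cores".format(len(tpu_cores)))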
(Optional) Training on only part of the data
DEMO_MODE = True #@param {type:"boolean"}
if DEMO_MODE:
    CORPUS_SIZE = 1000000
else:
    CORPUS_SIZE = 100000000 #@param {type: "integer"}

!(head -n $CORPUS_SIZE dataset.txt) > subdataset.txt
!mv subdataset.txt dataset.txt
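A quick line count confirms that the truncation worked as expected; this is an optional check and assumes the corpus file is still named dataset.txt.

# Should print at most CORPUS_SIZE lines.
!wc -l dataset.txt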
Uploading the model & training data to the GCP bucket
# BUCKET_NAME is a placeholder ("이부분을_수정해_주세요" = "edit this part"); replace it with your own GCS bucket name.
BUCKET_NAME = "이부분을_수정해_주세요" #@param {type:"string"}
MODEL_DIR = "bert_model" #@param {type:"string"}
# Create a local working directory for the model assets.
tf.gfile.MkDir(MODEL_DIR)
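The cell above only creates a local directory; the actual upload to the bucket can be done with gsutil. The pretraining_data path below is an assumption that matches the PRETRAINING_DIR value used in the next section.

# Copy the model assets (vocab, bert_config.json, any initial checkpoint)
# and the pretraining TFRecords into the bucket. Paths are illustrative.
!gsutil -m cp -r $MODEL_DIR gs://$BUCKET_NAME/
!gsutil -m cp -r pretraining_data gs://$BUCKET_NAME/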
Setting the hyperparameters for model training
# Colab's TPU is a v3-8, so NUM_TPU_CORES can be at most 8.
BUCKET_NAME = "user-blog-sample" #@param {type:"string"}
MODEL_DIR = "user_model" #@param {type:"string"}
PRETRAINING_DIR = "pretraining_data" #@param {type:"string"}
VOC_FNAME = "vocab.txt" #@param {type:"string"}
# Input data pipeline config
TRAIN_BATCH_SIZE = 128 #@param {type:"integer"}
MAX_PREDICTIONS = 20 #@param {type:"integer"}
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param
# Training procedure config
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
TRAIN_STEPS = 1000000 #@param {type:"integer"}
SAVE_CHECKPOINTS_STEPS = 2500 #@param {type:"integer"}
NUM_TPU_CORES = 8
if BUCKET_NAME:
    BUCKET_PATH = "gs://{}".format(BUCKET_NAME)
else:
    BUCKET_PATH = "."
BERT_GCS_DIR = "{}/{}".format(BUCKET_PATH, MODEL_DIR)
DATA_GCS_DIR = "{}/{}".format(BUCKET_PATH, PRETRAINING_DIR)
VOCAB_FILE = os.path.join(BERT_GCS_DIR, VOC_FNAME)
CONFIG_FILE = os.path.join(BERT_GCS_DIR, "bert_config.json")
INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
input_files = tf.gfile.Glob(os.path.join(DATA_GCS_DIR,'*tfrecord'))
log.info("Using checkpoint: {}".format(INIT_CHECKPOINT))
log.info("Using {} data shards".format(len(input_files)))
Loading the model onto the TPU and training it
model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=TRAIN_STEPS,
    num_warmup_steps=10,
    use_tpu=USE_TPU,
    use_one_hot_embeddings=True)

tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)

run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=BERT_GCS_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)

train_input_fn = input_fn_builder(
    input_files=input_files,
    max_seq_length=MAX_SEQ_LENGTH,
    max_predictions_per_seq=MAX_PREDICTIONS,
    is_training=True)
# Let's train!
estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS)
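Checkpoints are written to BERT_GCS_DIR every SAVE_CHECKPOINTS_STEPS steps, so progress can be checked at any time. The snippet below is just a sketch that reuses the variables defined above.

# Show the most recent checkpoint and the contents of the model directory.
log.info("Latest checkpoint: {}".format(tf.train.latest_checkpoint(BERT_GCS_DIR)))
!gsutil ls gs://$BUCKET_NAME/$MODEL_DIR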