Week 3
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
- tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
- 64: the desired output size from this layer
- Bidirectional: the LSTM processes the sequence in both directions (forward and backward) and combines the two results
- Because it is bidirectional, the actual output size is 64 * 2 = 128 (see the sketch below)
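A minimal shape check (my own sketch, not from the course) confirming the doubled output size:

import tensorflow as tf

# Bidirectional concatenates the forward and backward LSTM outputs by default,
# so 64 units come out as a 128-dimensional vector.
layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
dummy = tf.random.uniform((1, 10, 8))  # (batch, timesteps, features)
print(layer(dummy).shape)              # (1, 128)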
It is also possible to stack LSTMs:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
When you feed one LSTM into another, you have to set return_sequences=True on the first one. This ensures that its outputs match the shape of the inputs the next LSTM expects.
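A quick sketch of the shape difference (my own example with dummy data):

import tensorflow as tf

dummy = tf.random.uniform((1, 10, 8))  # (batch, timesteps, features)
# Without return_sequences the LSTM returns only the final hidden state
print(tf.keras.layers.LSTM(64)(dummy).shape)                          # (1, 64)
# With return_sequences=True it returns one output per timestep,
# which is the 3D input the next (Bidirectional) LSTM layer expects
print(tf.keras.layers.LSTM(64, return_sequences=True)(dummy).shape)   # (1, 10, 64)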
Let's start from a basic neural network architecture:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
What if we swap out the GlobalAveragePooling part?
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
The latter reaches higher accuracy, but overfitting shows up.
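To actually see the overfitting, the usual check is to plot the training vs. validation curves from the history object returned by model.fit (plot_graphs is just a small helper I am adding here, not part of the notes above):

import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    # Overfitting shows up as a widening gap between the training curve
    # and the validation curve over the epochs.
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.xlabel('epochs')
    plt.legend([metric, 'val_' + metric])
    plt.show()

# plot_graphs(history, 'accuracy')
# plot_graphs(history, 'loss')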
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
- 128 filters, each convolving over 5 words at a time (see the shape check below)
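Rough shape check for the Conv1D layer (120 and 16 are just example values for max_length and embedding_dim, not taken from the notes above):

import tensorflow as tf

x = tf.random.uniform((1, 120, 16))  # an embedded sequence: (batch, max_length, embedding_dim)
y = tf.keras.layers.Conv1D(128, 5, activation='relu')(x)
print(y.shape)  # (1, 116, 128): 120 - 5 + 1 positions (one per 5-word window), 128 filters each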
Back to the IMDB example:
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
- With text, the test data will always contain out-of-vocabulary words, so overfitting is a bigger issue than with images (toy illustration below)
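A toy illustration of that point (made-up sentences, not the course data): any word missing from the training vocabulary collapses to the <OOV> index at test time, so the model gets no real signal for it.

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(["the movie was great", "the movie was bad"])
# 'fantastic' was never seen during fitting, so it maps to the <OOV> index (1)
print(tokenizer.texts_to_sequences(["the movie was fantastic"]))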
Assignment
import csv
import random
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
from scipy.stats import linregress
EMBEDDING_DIM = 100
MAXLEN = 16
TRUNCATING = 'post'
PADDING = 'post'
OOV_TOKEN = "<OOV>"
MAX_EXAMPLES = 160000
TRAINING_SPLIT = 0.9
SENTIMENT_CSV = "./data/training_cleaned.csv"
def parse_data_from_file(filename):
    """
    Extracts sentences and labels from a CSV file

    Args:
        filename (string): path to the CSV file

    Returns:
        sentences, labels (list of string, list of int): tuple containing lists of sentences and labels
    """
    sentences = []
    labels = []
    with open(filename, 'r') as csvfile:
        ### START CODE HERE
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            # The text is in the sixth column; the sentiment ('0' or '4') is in the first
            sentences.append(row[5])
            if row[0] == '4':
                labels.append(1)
            else:
                labels.append(0)
        ### END CODE HERE
    return sentences, labels
# Test your function
sentences, labels = parse_data_from_file(SENTIMENT_CSV)
print(f"dataset contains {len(sentences)} examples\n")
print(f"Text of second example should look like this:\n{sentences[1]}\n")
print(f"Text of fourth example should look like this:\n{sentences[3]}")
print(f"\nLabels of last 5 examples should look like this:\n{labels[-5:]}")
# Bundle the two lists into a single one
sentences_and_labels = list(zip(sentences, labels))
# Perform random sampling
random.seed(42)
sentences_and_labels = random.sample(sentences_and_labels, MAX_EXAMPLES)
# Unpack back into separate lists
sentences, labels = zip(*sentences_and_labels)
print(f"There are {len(sentences)} sentences and {len(labels)} labels after random sampling\n")
def train_val_split(sentences, labels, training_split):
    """
    Splits the dataset into training and validation sets

    Args:
        sentences (list of string): lower-cased sentences without stopwords
        labels (list of int): list of labels
        training_split (float): proportion of the dataset to include in the train set

    Returns:
        train_sentences, validation_sentences, train_labels, validation_labels - lists containing the data splits
    """
    ### START CODE HERE
    # Compute the number of sentences that will be used for training (should be an integer)
    train_size = int(len(sentences) * training_split)

    # Split the sentences and labels into train/validation splits
    train_sentences = sentences[:train_size]
    train_labels = labels[:train_size]
    validation_sentences = sentences[train_size:]
    validation_labels = labels[train_size:]
    ### END CODE HERE

    return train_sentences, validation_sentences, train_labels, validation_labels
# Test your function
train_sentences, val_sentences, train_labels, val_labels = train_val_split(sentences, labels, TRAINING_SPLIT)
print(f"There are {len(train_sentences)} sentences for training.\n")
print(f"There are {len(train_labels)} labels for training.\n")
print(f"There are {len(val_sentences)} sentences for validation.\n")
print(f"There are {len(val_labels)} labels for validation.")
def fit_tokenizer(train_sentences, oov_token):
    """
    Instantiates the Tokenizer class on the training sentences

    Args:
        train_sentences (list of string): lower-cased sentences without stopwords to be used for training
        oov_token (string): symbol for the out-of-vocabulary token

    Returns:
        tokenizer (object): an instance of the Tokenizer class containing the word-index dictionary
    """
    ### START CODE HERE
    # Instantiate the Tokenizer class, passing in the correct value for oov_token
    tokenizer = Tokenizer(oov_token=oov_token)

    # Fit the tokenizer to the training sentences
    tokenizer.fit_on_texts(train_sentences)
    ### END CODE HERE

    return tokenizer
# Test your function
tokenizer = fit_tokenizer(train_sentences, OOV_TOKEN)
word_index = tokenizer.word_index
VOCAB_SIZE = len(word_index)
print(f"Vocabulary contains {VOCAB_SIZE} words\n")
print("<OOV> token included in vocabulary" if "<OOV>" in word_index else "<OOV> token NOT included in vocabulary")
print(f"\nindex of word 'i' should be {word_index['i']}")
def seq_pad_and_trunc(sentences, tokenizer, padding, truncating, maxlen):
    """
    Generates an array of token sequences and pads them to the same length

    Args:
        sentences (list of string): list of sentences to tokenize and pad
        tokenizer (object): Tokenizer instance containing the word-index dictionary
        padding (string): type of padding to use
        truncating (string): type of truncating to use
        maxlen (int): maximum length of the token sequence

    Returns:
        pad_trunc_sequences (array of int): tokenized sentences padded to the same length
    """
    ### START CODE HERE
    # Convert sentences to sequences
    sequences = tokenizer.texts_to_sequences(sentences)

    # Pad the sequences using the correct padding, truncating and maxlen
    pad_trunc_sequences = pad_sequences(sequences, padding=padding, truncating=truncating, maxlen=maxlen)
    ### END CODE HERE

    return pad_trunc_sequences
# Test your function
train_pad_trunc_seq = seq_pad_and_trunc(train_sentences, tokenizer, PADDING, TRUNCATING, MAXLEN)
val_pad_trunc_seq = seq_pad_and_trunc(val_sentences, tokenizer, PADDING, TRUNCATING, MAXLEN)
print(f"Padded and truncated training sequences have shape: {train_pad_trunc_seq.shape}\n")
print(f"Padded and truncated validation sequences have shape: {val_pad_trunc_seq.shape}")
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)
Using pre-trained embeddings from GloVe:
# Define path to file containing the embeddings
GLOVE_FILE = './data/glove.6B.100d.txt'
# Initialize an empty embeddings index dictionary
GLOVE_EMBEDDINGS = {}
# Read file and fill GLOVE_EMBEDDINGS with its contents
with open(GLOVE_FILE) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        GLOVE_EMBEDDINGS[word] = coefs
test_word = 'dog'
test_vector = GLOVE_EMBEDDINGS[test_word]
print(f"Vector representation of word {test_word} looks like this:\n\n{test_vector}")
print(f"Each word vector has shape: {test_vector.shape}")
# Initialize an empty numpy array with the appropriate size
EMBEDDINGS_MATRIX = np.zeros((VOCAB_SIZE+1, EMBEDDING_DIM))
# Iterate all of the words in the vocabulary and if the vector representation for
# each word exists within GloVe's representations, save it in the EMBEDDINGS_MATRIX array
for word, i in word_index.items():
    embedding_vector = GLOVE_EMBEDDINGS.get(word)
    if embedding_vector is not None:
        EMBEDDINGS_MATRIX[i] = embedding_vector
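Optional sanity check (my addition, not part of the assignment): see how much of the vocabulary actually received a GloVe vector; the remaining rows stay all zeros.

hits = sum(1 for word in word_index if word in GLOVE_EMBEDDINGS)
print(f"{hits} of {VOCAB_SIZE} words have a pre-trained GloVe vector")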
Define a model that doesn't overfit:
# GRADED FUNCTION: create_model
def create_model(vocab_size, embedding_dim, maxlen, embeddings_matrix):
    """
    Creates a binary sentiment classifier model

    Args:
        vocab_size (int): size of the vocabulary for the Embedding layer input
        embedding_dim (int): dimensionality of the Embedding layer output
        maxlen (int): length of the input sequences
        embeddings_matrix (array): predefined weights of the embeddings

    Returns:
        model (tf.keras Model): the sentiment classifier model
    """
    ### START CODE HERE
    model = tf.keras.Sequential([
        # This is how you need to set the Embedding layer when using pre-trained embeddings
        tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=maxlen, weights=[embeddings_matrix], trainable=False),
        tf.keras.layers.Conv1D(32, 5, activation='relu'),
        tf.keras.layers.GlobalMaxPooling1D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    ### END CODE HERE

    return model
# Create your untrained model
model = create_model(VOCAB_SIZE, EMBEDDING_DIM, MAXLEN, EMBEDDINGS_MATRIX)
# Train the model and save the training history
history = model.fit(train_pad_trunc_seq, train_labels, epochs=20, validation_data=(val_pad_trunc_seq, val_labels))
Week 4
tokenizer = Tokenizer()
data = "In the town of Athy one Jeremy Lanigan \n Battered away ... ..."
corpus = data.lower().split('\n')
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
- add 1 so the Embedding layer has a slot for every index (word indices start at 1; see the check below)
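A quick check of why the +1 is needed (toy sentence, my own example): the Tokenizer assigns word indices starting at 1, so the Embedding layer's input_dim has to be len(word_index) + 1 to cover every index.

from tensorflow.keras.preprocessing.text import Tokenizer

toy = Tokenizer()
toy.fit_on_texts(["in the town of athy"])
print(toy.word_index)           # {'in': 1, 'the': 2, 'town': 3, 'of': 4, 'athy': 5}
print(len(toy.word_index) + 1)  # 6 -> covers indices 0..5 (0 is used for padding)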
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
max_sequence_len = max([len(x) for x in input_sequences])
Find the longest sequence:
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
Padding.
For each input sequence, create an input x and a label y (the last token becomes the label).
<-- this is why padding was set to 'pre': it keeps the label in the last position, which makes extracting it easy
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]
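A tiny worked example with made-up token ids: because of the pre-padding, the label always sits in the last column, so the slicing above separates inputs and labels cleanly.

import numpy as np

toy = np.array([[0, 0, 0, 4, 2],
                [0, 0, 4, 2, 66]])
print(toy[:, :-1])  # inputs x:  [[0 0 0 4] [0 0 4 2]]
print(toy[:, -1])   # labels y:  [ 2 66]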
Since this is ultimately a classification problem, the labels have to be one-hot encoded
(list --> categorical):
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
Define and train the model:
model = Sequential()
model.add(Embedding(total_words, 64, input_length=max_sequence_len - 1))
model.add(Bidirectional(LSTM(20)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(xs, ys, epochs=500, verbose=1)
- max_sequence_len - 1: subtract one from the length because the last token is used as the label
Predict the next 10 words:
seed_text = "Laurence went to dublin"
next_words = 10
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # model.predict_classes was removed in newer TF versions; take the argmax of the probabilities instead
    probabilities = model.predict(token_list, verbose=0)
    predicted = np.argmax(probabilities, axis=-1)[0]
    # Ignore if index is 0 because that is just the padding
    # if predicted != 0:
    output_word = tokenizer.index_word[predicted]
    seed_text += " " + output_word
<Training on a larger dataset>
data = open('/tmp/irish-lyrics-eof.txt').read()
Hyperparameters adjusted for the larger dataset:
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])
history = model.fit(xs, ys, epochs=100, verbose=1)
- the embedding dimension, the LSTM units, and the optimizer were changed
If the dataset gets really large (the complete works of Shakespeare, say), even the one-hot encoded labels become enormous
-> try working at the character level instead (minimal sketch below)
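A minimal sketch of the character-level idea (my own toy example, not the course code), using the Tokenizer's char_level flag:

from tensorflow.keras.preprocessing.text import Tokenizer

# At the character level the "vocabulary" is just the individual characters,
# so the one-hot labels stay small even for a huge corpus.
char_tokenizer = Tokenizer(char_level=True)
char_tokenizer.fit_on_texts(["to be or not to be"])
print(len(char_tokenizer.word_index))                # only a handful of characters
print(char_tokenizer.texts_to_sequences(["to be"]))  # one id per character (space included)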
Assignment
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
SONNETS_FILE = './sonnets.txt'
# Read the data
with open('./sonnets.txt') as f:
    data = f.read()
# Convert to lower case and save as a list
corpus = data.lower().split("\n")
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
# GRADED FUNCTION: n_gram_seqs
def n_gram_seqs(corpus, tokenizer):
    """
    Generates a list of n-gram sequences

    Args:
        corpus (list of string): lines of texts to generate n-grams for
        tokenizer (object): an instance of the Tokenizer class containing the word-index dictionary

    Returns:
        input_sequences (list of int): the n-gram sequences for each line in the corpus
    """
    input_sequences = []

    ### START CODE HERE
    for line in corpus:
        seq = tokenizer.texts_to_sequences([line])[0]
        for n in range(1, len(seq)):
            input_sequences.append(seq[:n+1])
    ### END CODE HERE

    return input_sequences
# Test your function with one example
first_example_sequence = n_gram_seqs([corpus[0]], tokenizer)
print("n_gram sequences for first example look like this:\n")
first_example_sequence
# Apply the n_gram_seqs transformation to the whole corpus
input_sequences = n_gram_seqs(corpus, tokenizer)
# Save max length
max_sequence_len = max([len(x) for x in input_sequences])
print(f"n_grams of input_sequences have length: {len(input_sequences)}")
print(f"maximum length of sequences is: {max_sequence_len}")
# GRADED FUNCTION: pad_seqs
def pad_seqs(input_sequences, maxlen):
    """
    Pads tokenized sequences to the same length

    Args:
        input_sequences (list of int): tokenized sequences to pad
        maxlen (int): maximum length of the token sequences

    Returns:
        padded_sequences (array of int): tokenized sequences padded to the same length
    """
    ### START CODE HERE
    padded_sequences = np.array(pad_sequences(input_sequences, maxlen=maxlen))
    ### END CODE HERE

    return padded_sequences
# Test your function with the n_grams_seq of the first example
first_padded_seq = pad_seqs(first_example_sequence, len(first_example_sequence))
first_padded_seq
# Pad the whole corpus
input_sequences = pad_seqs(input_sequences, max_sequence_len)
print(f"padded corpus has shape: {input_sequences.shape}")
# GRADED FUNCTION: features_and_labels
def features_and_labels(input_sequences, total_words):
    """
    Generates features and labels from n-grams

    Args:
        input_sequences (list of int): sequences to split features and labels from
        total_words (int): vocabulary size

    Returns:
        features, one_hot_labels (array of int, array of int): arrays of features and one-hot encoded labels
    """
    ### START CODE HERE
    features = input_sequences[:, :-1]
    labels = input_sequences[:, -1]
    one_hot_labels = to_categorical(labels, num_classes=total_words)
    ### END CODE HERE

    return features, one_hot_labels
# Split the whole corpus
features, labels = features_and_labels(input_sequences, total_words)
print(f"features have shape: {features.shape}")
print(f"labels have shape: {labels.shape}")
# GRADED FUNCTION: create_model
def create_model(total_words, max_sequence_len):
    """
    Creates a text generator model

    Args:
        total_words (int): size of the vocabulary for the Embedding layer input
        max_sequence_len (int): length of the input sequences

    Returns:
        model (tf.keras Model): the text generator model
    """
    model = Sequential()
    ### START CODE HERE
    model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))
    model.add(Bidirectional(LSTM(256)))
    model.add(Dense(total_words, activation='softmax'))

    # Compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    ### END CODE HERE

    return model
# Get the untrained model
model = create_model(total_words, max_sequence_len)
# Train the model
history = model.fit(features, labels, epochs=50, verbose=1)
Prediction:
seed_text = "Help me Obi Wan Kenobi, you're my only hope"
next_words = 100
for _ in range(next_words):
    # Convert the text into sequences
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Pad the sequences
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Get the probabilities of predicting a word
    predicted = model.predict(token_list, verbose=0)
    # Choose the next word based on the maximum probability
    predicted = np.argmax(predicted, axis=-1).item()
    # Get the actual word from the word index
    output_word = tokenizer.index_word[predicted]
    # Append to the current text
    seed_text += " " + output_word

print(seed_text)