Week 1
What if we assign a value (number) to each word?
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
sentences = [
'I love my dog',
'I love my cat'
]
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)
- Tokenizer : builds the vocabulary from the sentences so they can be turned into vectors of tokens
- num_words=100 : keep only the top 100 words by frequency
- word_index is returned as a dictionary: keys are the words, values are their tokens (expected output sketched below)
- 'i' : note that it has been lowercased
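For these two sentences, the printed word_index comes out roughly like this (indices follow word frequency, with ties broken by first appearance):

# Expected output
# {'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}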
What if we modify the sentences as follows?
sentences = [
'I love my dog',
'I love my cat',
'You love my dog!'
]
- Should 'dog!' be treated as a different word from 'dog'? -> No, punctuation is stripped out.
from tensorflow.keras.preprocessing.text import Tokenizer
sentences = [
'I love my dog',
'I love my cat',
'You love my dog!',
'Do you think my dog is amazing?'
]
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
print(word_index)
print(sequences)
Train and test data must share the same word index: the tokenizer fit with fit_on_texts on the training sentences is reused for the test sentences (naturally).
test_data = [
'i really love my dog',
'my dog loves my manatee'
]
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)
Unseen words are mixed in, so they are simply dropped from the resulting sequences.
--> We'd need a huge training set to cover the vocabulary
--> Or: how about reserving a special value for unseen words?
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
Add oov_token='<OOV>' (out of vocabulary).
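With the OOV token configured above, re-fitting and converting the same test sentences keeps every sequence full-length; a minimal sketch (Keras normally assigns the <OOV> token index 1):

tokenizer.fit_on_texts(sentences)
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)
# unseen words such as 'really', 'loves', and 'manatee' now show up as 1 (<OOV>),
# so each output sequence has one entry per input word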
Padding: making the sequences uniform in size.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
sentences = [
'I love my dog',
'I love my cat',
'You love my dog!',
'Do you think my dog is amazing?'
]
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences)
Generate the sequences with the tokenizer, then pad them.
Zeros are inserted so every sequence has the same length.
padded = pad_sequences(
sequences,
padding='post',
truncating='post',
maxlen=5)
- padding='post' : pad at the end (the default is 'pre')
- maxlen=5 : cap each sequence at 5 tokens
- truncating='post' : when trimming down to maxlen, drop tokens from the end (see the sketch right after this list)
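A quick look at the result (a sketch; the exact numbers depend on the indices the tokenizer assigned):

print(padded)
# every row has exactly 5 entries: the four-word sentences get a trailing 0,
# and the seven-word sentence keeps only its first 5 tokens
print(padded.shape)  # (4, 5)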
Using the sarcasm dataset
import json
with open('sarcasm.json', 'r') as f:
    datastore = json.load(f)
sentences = []
labels = []
urls = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])
Loading the data
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
print(padded[0])
print(padded.shape)
Even if num_words is set to 100 when creating the Tokenizer and calling fit_on_texts, word_index still contains all 26,709 words.
- num_words only takes effect when generating sequences: only the top 100 words are kept, and words outside the top 100 become <OOV> (see the sketch below).
- Stopwords like 'to' and 'of' are best removed before calling fit_on_texts.
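A small sketch of that behaviour, assuming the Tokenizer is re-created with num_words=100:

tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
print(len(tokenizer.word_index))                      # still the full vocabulary (~26,709 words)
print(tokenizer.texts_to_sequences([sentences[0]]))   # any word outside the top-100 index range comes out as 1 (<OOV>)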
Assignment
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
with open("./data/bbc-text.csv", 'r') as csvfile:
    print(f"First line (header) looks like this:\n\n{csvfile.readline()}")
    print(f"Each data point looks like this:\n\n{csvfile.readline()}")
Removing stopwords
# GRADED FUNCTION: remove_stopwords
def remove_stopwords(sentence):
    """
    Removes a list of stopwords
    Args:
        sentence (string): sentence to remove the stopwords from
    Returns:
        sentence (string): lowercase sentence without the stopwords
    """
    # List of stopwords
    stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
    # Sentence converted to lowercase-only
    sentence = sentence.lower()
    ### START CODE HERE
    sentence_list = sentence.split(" ")
    sentence_set = list(set(sentence_list) - set(stopwords))
    sentence = [item for item in sentence_list if item in sentence_set]
    sentence = ' '.join(sentence)
    ### END CODE HERE
    return sentence
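A quick sanity check on a made-up sentence (hypothetical input, not part of the grader):

print(remove_stopwords("I am about to go to the store and get any snack"))
# -> 'go store get snack'  ('i', 'am', 'about', 'to', 'the', 'and', 'any' are all stopwords)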
# GRADED FUNCTION: parse_data_from_file
def parse_data_from_file(filename):
    """
    Extracts sentences and labels from a CSV file
    Args:
        filename (string): path to the CSV file
    Returns:
        sentences, labels (list of string, list of string): tuple containing lists of sentences and labels
    """
    sentences = []
    labels = []
    with open(filename, 'r') as csvfile:
        ### START CODE HERE
        reader = csv.reader(csvfile, delimiter=',')
        for num, row in enumerate(reader):
            if num == 0:
                continue
            labels.append(row[0])
            sentences.append(remove_stopwords(row[1]))
        ### END CODE HERE
    return sentences, labels
# Test your function
# With original dataset
sentences, labels = parse_data_from_file("./data/bbc-text.csv")
print("ORIGINAL DATASET:\n")
print(f"There are {len(sentences)} sentences in the dataset.\n")
print(f"First sentence has {len(sentences[0].split())} words (after removing stopwords).\n")
print(f"There are {len(labels)} labels in the dataset.\n")
print(f"The first 5 labels are {labels[:5]}\n\n")
# With a miniature version of the dataset that contains only first 5 rows
mini_sentences, mini_labels = parse_data_from_file("./data/bbc-text-minimal.csv")
print("MINIATURE DATASET:\n")
print(f"There are {len(mini_sentences)} sentences in the miniature dataset.\n")
print(f"First sentence has {len(mini_sentences[0].split())} words (after removing stopwords).\n")
print(f"There are {len(mini_labels)} labels in the miniature dataset.\n")
print(f"The first 5 labels are {mini_labels[:5]}")
# GRADED FUNCTION: fit_tokenizer
def fit_tokenizer(sentences):
    """
    Instantiates the Tokenizer class
    Args:
        sentences (list): lower-cased sentences without stopwords
    Returns:
        tokenizer (object): an instance of the Tokenizer class containing the word-index dictionary
    """
    ### START CODE HERE
    # Instantiate the Tokenizer class by passing in the oov_token argument
    tokenizer = Tokenizer(oov_token='<OOV>')
    # Fit on the sentences
    tokenizer.fit_on_texts(sentences)
    ### END CODE HERE
    return tokenizer
tokenizer = fit_tokenizer(sentences)
word_index = tokenizer.word_index
print(f"Vocabulary contains {len(word_index)} words\n")
print("<OOV> token included in vocabulary" if "<OOV>" in word_index else "<OOV> token NOT included in vocabulary")
# GRADED FUNCTION: get_padded_sequences
def get_padded_sequences(tokenizer, sentences):
    """
    Generates an array of token sequences and pads them to the same length
    Args:
        tokenizer (object): Tokenizer instance containing the word-index dictionary
        sentences (list of string): list of sentences to tokenize and pad
    Returns:
        padded_sequences (array of int): tokenized sentences padded to the same length
    """
    ### START CODE HERE
    # Convert sentences to sequences
    sequences = tokenizer.texts_to_sequences(sentences)
    # Pad the sequences using the post padding strategy
    padded_sequences = pad_sequences(sequences, padding='post')
    ### END CODE HERE
    return padded_sequences
padded_sequences = get_padded_sequences(tokenizer, sentences)
print(f"First padded sequence looks like this: \n\n{padded_sequences[0]}\n")
print(f"Numpy array of all sequences has shape: {padded_sequences.shape}\n")
print(f"This means there are {padded_sequences.shape[0]} sequences in total and each one has a size of {padded_sequences.shape[1]}")
The labels are tokenized as well.
# GRADED FUNCTION: tokenize_labels
def tokenize_labels(labels):
    """
    Tokenizes the labels
    Args:
        labels (list of string): labels to tokenize
    Returns:
        label_sequences, label_word_index (list of string, dictionary): tokenized labels and the word-index
    """
    ### START CODE HERE
    # Instantiate the Tokenizer class
    # No need to pass additional arguments since you will be tokenizing the labels
    label_tokenizer = Tokenizer()
    # Fit the tokenizer to the labels
    label_tokenizer.fit_on_texts(labels)
    # Save the word index
    label_word_index = label_tokenizer.word_index
    # Save the sequences
    label_sequences = label_tokenizer.texts_to_sequences(labels)
    ### END CODE HERE
    return label_sequences, label_word_index
label_sequences, label_word_index = tokenize_labels(labels)
print(f"Vocabulary of labels looks like this {label_word_index}\n")
print(f"First ten sequences {label_sequences[:10]}\n")
Week 2
import numpy as np
import tensorflow_datasets as tfds
imdb, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_data, test_data = imdb['train'], imdb['test']
The examples are tensor objects, so call .numpy() to pull out their values.
training_sentences = []
training_labels = []
testing_sentences = []
testing_labels = []
for s, l in train_data:
    training_sentences.append(s.numpy().decode('utf8'))
    training_labels.append(l.numpy())
for s, l in test_data:
    testing_sentences.append(s.numpy().decode('utf8'))
    testing_labels.append(l.numpy())
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)
Setting up the data
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = '<OOV>'
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length)
- Think of it as applying the word_index learned from the training set to the testing set.
model = tf.keras.Sequential([
tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
tf.keras.layers.Flatten(),
# tf.keras.layers.GlobalAveragePooling1D() also works here (it averages across the vector)
tf.keras.layers.Dense(6, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
The meaning of the words can be learned from the labelling of the dataset.
Words with similar meanings end up with similar vectors.
The embedding output per sentence is a 2D array: sentence length x embedding dimension.
-> so it has to be flattened before the Dense layers, just as we did with images (shape flow sketched below).
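A rough sketch of how the shapes flow through this model (batch dimension omitted; numbers assume max_length=120 and embedding_dim=16 as set above):

# Embedding:              (120,) int tokens -> (120, 16)
# Flatten:                (120, 16)         -> (1920,)
# GlobalAveragePooling1D: (120, 16)         -> (16,)   # averages over the sequence axis instead
# Dense(6, relu) -> Dense(1, sigmoid): a single score between 0 and 1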
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
num_epochs = 10
model.fit(
padded,
training_labels_final,
epochs=num_epochs,
validation_data=(testing_padded, testing_labels_final)
)
Visualizing the embeddings:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # (vocab_size, embedding_dim)
(10000, 16)
reverse_word_index = tokenizer.index_word
Get the dictionary in token:word form (the reverse of word_index).
import io
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()
-> each line of vecs.tsv holds the coefficients of every embedding dimension for that word.
You can visualize the result at projector.tensorflow.org.
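If you are working in Colab, the two TSV files can be downloaded and then loaded into the projector (a sketch assuming a Colab runtime; skip it otherwise):

try:
    from google.colab import files
    files.download('vecs.tsv')
    files.download('meta.tsv')
except ImportError:
    pass  # not running in Colab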
Now let's do the same with the sarcasm dataset.
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
vocab_size = 10000
embedding_dim = 16
max_length = 32
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_size = 20000  # the rest will be used for validation
<after downloading the sarcasm dataset>
with open('/tmp/sarcasm.json', 'r') as f:
    datastore = json.load(f)
sentences = []
labels = []
for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]
The data is now split.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences,
maxlen=max_length,
padding=padding_type,
truncating=trunc_type)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,
maxlen=max_length,
padding=padding_type,
truncating=trunc_type)
Sequencing + padding.
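One practical note: in TensorFlow 2.x, model.fit expects arrays (or tf.data datasets) rather than plain Python lists, so the padded sequences and especially the label lists are usually converted first — a minimal sketch:

import numpy as np

training_padded = np.array(training_padded)   # pad_sequences already returns an array; harmless
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)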
model = tf.keras.Sequential([
tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(24, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
num_epochs = 30
history = model.fit(
training_padded,
training_labels,
epochs=num_epochs,
validation_data=(testing_padded, testing_labels),
verbose=2)
Training like this, you can watch val_loss keep climbing. Why? The model is overfitting the training headlines.
Let's try adjusting the hyperparameters:
vocab_size = 1000 # changed
embedding_dim = 16
max_length = 16 # changed
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_size = 20000
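To see whether the tweaks help, plotting the training curves is handy (a sketch using matplotlib; it assumes you re-run the tokenization and training cells after changing the hyperparameters, so `history` reflects the new run):

import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    # Plot a training metric and its validation counterpart over the epochs
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.xlabel('epochs')
    plt.ylabel(metric)
    plt.legend([metric, 'val_' + metric])
    plt.show()

plot_graphs(history, 'accuracy')
plot_graphs(history, 'loss')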
Let's run the sub-word version of the imdb dataset.
import tensorflow_datasets as tfds
imdb, info = tfds.load('imdb_reviews/subwords8k', with_info=True, as_supervised=True)
train_data, test_data = imdb['train'], imdb['test']
Accessing the sub-words tokenizer:
tokenizer = info.features['text'].encoder
print(tokenizer.subwords)
sample_string = 'Tensorflow, from basics to mastery'
tokenized_string = tokenizer.encode(sample_string)
print('Tokenized string is {}'.format(tokenized_string))
original_string = tokenizer.decode(tokenized_string)
print('The original string : {}'.format(original_string))
for ts in tokenized_string:
    print('{} ---> {}'.format(ts, tokenizer.decode([ts])))
embedding_dim = 64
model = tf.keras.Sequential([
tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
tf.keras.layers.GlobalAveragePooling1D(),
tf.keras.layers.Dense(6, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
num_epochs = 10
model.compile(
loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy'])
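# Note: model.fit below needs batched datasets. The raw train_data / test_data from TFDS yield
# unbatched (sequence, label) pairs, so they are shuffled and padded into batches first.
# A sketch (the BUFFER_SIZE / BATCH_SIZE values are assumptions; padded_batch without
# padded_shapes needs TF >= 2.2):
BUFFER_SIZE = 10000
BATCH_SIZE = 64
train_dataset = train_data.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
test_dataset = test_data.padded_batch(BATCH_SIZE)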
history = model.fit(train_dataset, epochs=num_epochs, validation_data=test_dataset)
Sub-words only carry meaning when kept in sequence, so word order matters.
Assignment
import io
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
NUM_WORDS = 1000
EMBEDDING_DIM = 16
MAXLEN = 120
PADDING = 'post'
OOV_TOKEN = "<OOV>"
TRAINING_SPLIT = .8
def remove_stopwords(sentence):
    """
    Removes a list of stopwords
    Args:
        sentence (string): sentence to remove the stopwords from
    Returns:
        sentence (string): lowercase sentence without the stopwords
    """
    # List of stopwords
    stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
    # Sentence converted to lowercase-only
    sentence = sentence.lower()
    words = sentence.split()
    no_words = [w for w in words if w not in stopwords]
    sentence = " ".join(no_words)
    return sentence
def parse_data_from_file(filename):
    """
    Extracts sentences and labels from a CSV file
    Args:
        filename (string): path to the CSV file
    Returns:
        sentences, labels (list of string, list of string): tuple containing lists of sentences and labels
    """
    sentences = []
    labels = []
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)
        for row in reader:
            labels.append(row[0])
            sentence = row[1]
            sentence = remove_stopwords(sentence)
            sentences.append(sentence)
    return sentences, labels
# Test the functions
sentences, labels = parse_data_from_file("./bbc-text.csv")
print(f"There are {len(sentences)} sentences in the dataset.\n")
print(f"First sentence has {len(sentences[0].split())} words (after removing stopwords).\n")
print(f"There are {len(labels)} labels in the dataset.\n")
print(f"The first 5 labels are {labels[:5]}")
# GRADED FUNCTIONS: train_val_split
def train_val_split(sentences, labels, training_split):
    """
    Splits the dataset into training and validation sets
    Args:
        sentences (list of string): lower-cased sentences without stopwords
        labels (list of string): list of labels
        training_split (float): proportion of the dataset to include in the train set
    Returns:
        train_sentences, validation_sentences, train_labels, validation_labels - lists containing the data splits
    """
    ### START CODE HERE
    # Compute the number of sentences that will be used for training (should be an integer)
    train_size = int(len(sentences) * training_split)
    # Split the sentences and labels into train/validation splits
    train_sentences = sentences[:train_size]
    train_labels = labels[:train_size]
    validation_sentences = sentences[train_size:]
    validation_labels = labels[train_size:]
    ### END CODE HERE
    return train_sentences, validation_sentences, train_labels, validation_labels
# Test your function
train_sentences, val_sentences, train_labels, val_labels = train_val_split(sentences, labels, TRAINING_SPLIT)
print(f"There are {len(train_sentences)} sentences for training.\n")
print(f"There are {len(train_labels)} labels for training.\n")
print(f"There are {len(val_sentences)} sentences for validation.\n")
print(f"There are {len(val_labels)} labels for validation.")
# GRADED FUNCTION: fit_tokenizer
def fit_tokenizer(train_sentences, num_words, oov_token):
    """
    Instantiates the Tokenizer class on the training sentences
    Args:
        train_sentences (list of string): lower-cased sentences without stopwords to be used for training
        num_words (int) - number of words to keep when tokenizing
        oov_token (string) - symbol for the out-of-vocabulary token
    Returns:
        tokenizer (object): an instance of the Tokenizer class containing the word-index dictionary
    """
    ### START CODE HERE
    # Instantiate the Tokenizer class, passing in the correct values for num_words and oov_token
    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    # Fit the tokenizer to the training sentences
    tokenizer.fit_on_texts(train_sentences)
    ### END CODE HERE
    return tokenizer
# Test your function
tokenizer = fit_tokenizer(train_sentences, NUM_WORDS, OOV_TOKEN)
word_index = tokenizer.word_index
print(f"Vocabulary contains {len(word_index)} words\n")
print("<OOV> token included in vocabulary" if "<OOV>" in word_index else "<OOV> token NOT included in vocabulary")
# GRADED FUNCTION: seq_and_pad
def seq_and_pad(sentences, tokenizer, padding, maxlen):
    """
    Generates an array of token sequences and pads them to the same length
    Args:
        sentences (list of string): list of sentences to tokenize and pad
        tokenizer (object): Tokenizer instance containing the word-index dictionary
        padding (string): type of padding to use
        maxlen (int): maximum length of the token sequence
    Returns:
        padded_sequences (array of int): tokenized sentences padded to the same length
    """
    ### START CODE HERE
    # Convert sentences to sequences
    sequences = tokenizer.texts_to_sequences(sentences)
    # Pad the sequences using the correct padding and maxlen
    padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding=padding)
    ### END CODE HERE
    return padded_sequences
# Test your function
train_padded_seq = seq_and_pad(train_sentences, tokenizer, PADDING, MAXLEN)
val_padded_seq = seq_and_pad(val_sentences, tokenizer, PADDING, MAXLEN)
print(f"Padded training sequences have shape: {train_padded_seq.shape}\n")
print(f"Padded validation sequences have shape: {val_padded_seq.shape}")
# GRADED FUNCTION: tokenize_labels
def tokenize_labels(all_labels, split_labels):
    """
    Tokenizes the labels
    Args:
        all_labels (list of string): labels to generate the word-index from
        split_labels (list of string): labels to tokenize
    Returns:
        label_seq_np (array of int): tokenized labels
    """
    ### START CODE HERE
    # Instantiate the Tokenizer (no additional arguments needed)
    label_tokenizer = Tokenizer()
    # Fit the tokenizer on all the labels
    label_tokenizer.fit_on_texts(all_labels)
    # Convert labels to sequences
    label_seq = label_tokenizer.texts_to_sequences(split_labels)
    # Convert sequences to a numpy array. Don't forget to subtract 1 from every entry in the array!
    label_seq_np = np.array(label_seq) - 1
    ### END CODE HERE
    return label_seq_np
# Test your function
train_label_seq = tokenize_labels(labels, train_labels)
val_label_seq = tokenize_labels(labels, val_labels)
print(f"First 5 labels of the training set should look like this:\n{train_label_seq[:5]}\n")
print(f"First 5 labels of the validation set should look like this:\n{val_label_seq[:5]}\n")
print(f"Tokenized labels of the training set have shape: {train_label_seq.shape}\n")
print(f"Tokenized labels of the validation set have shape: {val_label_seq.shape}\n")
# GRADED FUNCTION: create_model
def create_model(num_words, embedding_dim, maxlen):
    """
    Creates a text classifier model
    Args:
        num_words (int): size of the vocabulary for the Embedding layer input
        embedding_dim (int): dimensionality of the Embedding layer output
        maxlen (int): length of the input sequences
    Returns:
        model (tf.keras Model): the text classifier model
    """
    tf.random.set_seed(123)
    ### START CODE HERE
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(num_words, embedding_dim, input_length=maxlen),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(6, activation='relu'),
        tf.keras.layers.Dense(5, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    ### END CODE HERE
    return model
model = create_model(NUM_WORDS, EMBEDDING_DIM, MAXLEN)
history = model.fit(train_padded_seq, train_label_seq, epochs=30, validation_data=(val_padded_seq, val_label_seq))
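matplotlib is imported at the top of the assignment but not used yet; a short sketch of the usual way to inspect the run (same idea as the plot_graphs helper above):

for metric in ['accuracy', 'loss']:
    # Compare the training metric with its validation counterpart across the 30 epochs
    plt.plot(history.history[metric], label=metric)
    plt.plot(history.history['val_' + metric], label='val_' + metric)
    plt.xlabel('epochs')
    plt.legend()
    plt.show()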