TTS data pipeline
import numpy as np
import tensorflow as tf
# plus the project-specific helpers used below: pa (audio processing / vocab), pt (text processing), hp (hyperparameters)

class _dataset:
    def __init__(self, name, sess, ids):  # ids: ['LJ001-0001', 'LJ001-0002', ...]
        self.name = name
        self.sess = sess
        self.transcript = np.loadtxt('./data/LJSpeech-1.1/transcript.csv', dtype=str, delimiter='|', encoding='utf-8')
        # [['LJ001-0001', 'Printing, ...'], [...], ...]
        self.id2text = {id: text for id, text in self.transcript[:, :2]}  # {'LJ001-0001': 'Printing, ...', 'LJ001-0002': 'Writing, ...', ...}
        self.texts = [self.id2text[id] for id in ids]  # ['Printing, ...', 'Writing, ...', ...]
        self.ids = ids  # ['LJ001-0001', 'LJ001-0002', ...]
        self.ch2id = {v: id for id, v in enumerate(pa.vocab)}
        file_paths = ['./data/LJSpeech-1.1/wavs/{}.wav'.format(id) for id in self.ids]
        self.create_tf_dataset(file_paths, self.ids, hp.BATCH_SIZE)
        # sets self.encoder_code, self.decoder_input, self.mel_spectro, self.linear_spectro

    def text_to_code(self, text):
        text = pt.text_normalize(text)
        return pt.transform_text_to_code(text, self.ch2id, pt.NB_CHARS_MAX)

    def codes(self, offset):
        code = np.asarray(self.text_to_code(self.texts[offset]))
        return code

    def create_tf_dataset(self, file_paths, ids, batch_size):
        def parse_function(data_path, id):
            def mel_linear_spectro_decod_input(data_path, id):  # './data/LJSpeech-1.1/wavs/LJ001-0001.wav', 'LJ001-0001'
                mel_spectro, linear_spectro, decoder_input = pa.padded_mel_linear_spectro_decod_input(data_path)
                idstr = ''.join([chr(b) for b in id])  # py_func passes the id in as bytes, e.g. b'LJ001-0001'
                encoder_input = self.text_to_code(self.id2text[idstr])
                return [encoder_input, decoder_input, mel_spectro, linear_spectro]
            y = tf.py_func(
                mel_linear_spectro_decod_input,                   # wrapped python function
                [data_path, id],                                  # arguments
                [tf.int32, tf.float32, tf.float32, tf.float32])   # return types
            return y
        dataset = tf.data.Dataset.from_tensor_slices((file_paths, ids))
        dataset = dataset.map(parse_function, num_parallel_calls=4)  # run parse_function on 4 elements in parallel
        dataset = dataset.repeat().batch(batch_size)  # repeat indefinitely so batches are always refilled
        dataset = dataset.prefetch(1)  # prepare one batch ahead of time
        iter = tf.data.Iterator.from_structure(
            (tf.int32, tf.float32, tf.float32, tf.float32),  # output types
            ((batch_size, 200), (batch_size, 200, 80), (batch_size, 200, 400), (batch_size, 850, 513)))  # output shapes
        # [encoder_input, decoder_input, mel_spectro, linear_spectro]
        self.encoder_code, self.decoder_input, self.mel_spectro, self.linear_spectro = iter.get_next()
        self.init_operation = iter.make_initializer(dataset)

    def init_op(self):
        self.sess.run(self.init_operation)
Usage example:

ds = _dataset('train', sess, id_list)
ds.init_op()
ds.encoder_code, ds.decoder_input, ds.mel_spectro, ds.linear_spectro  # tensors produced by the iterator
model.train_on_batch([ds.encoder_code, ds.decoder_input], [ds.mel_spectro, ds.linear_spectro])
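The class above relies on helper modules (pa, pt, hp) that are not shown here. As a rough sketch of what the text side might look like, the following stand-ins for pt.text_normalize and pt.transform_text_to_code are assumptions for illustration only (vocabulary, padding symbol, and NB_CHARS_MAX are guesses that happen to match the (batch_size, 200) encoder shape used above), not the actual implementation:

import re
import numpy as np

# Hypothetical stand-ins for the pt helpers used by _dataset.text_to_code;
# the real module may use a different vocabulary, padding symbol, or max length.
NB_CHARS_MAX = 200                              # assumed: matches the (batch_size, 200) encoder shape
vocab = "P abcdefghijklmnopqrstuvwxyz'.,?"      # assumed vocabulary, 'P' = padding

def text_normalize(text):
    # lowercase and drop any character outside the assumed vocabulary
    text = text.lower()
    return re.sub("[^{}]".format(re.escape(vocab)), "", text)

def transform_text_to_code(text, ch2id, nb_chars_max):
    # map characters to ids and right-pad with the padding id up to nb_chars_max
    code = [ch2id[ch] for ch in text[:nb_chars_max]]
    code += [ch2id['P']] * (nb_chars_max - len(code))
    return np.asarray(code, dtype=np.int32)

With ch2id = {v: id for id, v in enumerate(vocab)}, transform_text_to_code('printing', ch2id, NB_CHARS_MAX) returns a length-200 int32 array, which is what the tf.py_func above declares as its first return type.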
A smaller, self-contained example of the same tf.data pattern (shuffle, map, batch, prefetch, repeat) with an initializable iterator:
def parse_function(filename, label):
    return [label, label], label

def get_dataset(filenames, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
    dataset = dataset.shuffle(len(filenames))
    dataset = dataset.map(parse_function, num_parallel_calls=4)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(1)
    dataset = dataset.repeat()
    return dataset

filenames = ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a']
labels    = [ 1,   2,   3,   4,   5,   6,   7,   8,   9,  10]
batch_size = 4

dataset = get_dataset(filenames, labels, batch_size)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()
init_op = iterator.initializer

with tf.Session() as sess:
    sess.run(init_op)
    print(sess.run(next_element))
    print(sess.run(next_element))
    print(sess.run(next_element))
    print(sess.run(next_element))
- """
(array([[2, 2], [9, 9], [1, 1], [8, 8]]), array([2, 9, 1, 8]))
(array([[ 6, 6], [ 5, 5], [10, 10], [ 3, 3]]), array([ 6, 5, 10, 3]))
(array([[4, 4], [7, 7]]), array([4, 7]))
(array([[4, 4], [9, 9], [3, 3], [6, 6]]), array([4, 9, 3, 6]))
"""
The built-in input pipeline: never use 'feed_dict' anymore (updated to TensorFlow 1.8).
A simple model using batching and switching between train and test datasets with an initializable iterator:
# Wrapping it all together -> switch between train and test set using an initializable iterator
EPOCHS = 10
BATCH_SIZE = 16
# create a placeholder to dynamically switch between batch sizes
batch_size = tf.placeholder(tf.int64)
x, y = tf.placeholder(tf.float32, shape=[None, 2]), tf.placeholder(tf.float32, shape=[None, 1])
dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size).repeat()
iter = dataset.make_initializable_iterator()
features, labels = iter.get_next()
# using two numpy arrays
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.random.sample((20, 2)), np.random.sample((20, 1)))
n_batches = train_data[0].shape[0] // BATCH_SIZE
# make a simple model
net = tf.layers.dense(features, 8, activation=tf.tanh)  # the first value from iter.get_next() is the input
net = tf.layers.dense(net, 8, activation=tf.tanh)
prediction = tf.layers.dense(net, 1, activation=tf.tanh)
loss = tf.losses.mean_squared_error(prediction, labels)  # the second value from iter.get_next() is the label
train_op = tf.train.AdamOptimizer().minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # initialise the iterator with the train data
    sess.run(iter.initializer, feed_dict={x: train_data[0], y: train_data[1], batch_size: BATCH_SIZE})
    print('Training...')
    for i in range(EPOCHS):
        tot_loss = 0
        for _ in range(n_batches):
            _, loss_value = sess.run([train_op, loss])  # no feed_dict needed per batch
            tot_loss += loss_value
        print("Iter: {}, Loss: {:.4f}".format(i, tot_loss / n_batches))
    # initialise the iterator with the test data
    sess.run(iter.initializer, feed_dict={x: test_data[0], y: test_data[1], batch_size: test_data[0].shape[0]})
    print('Test Loss: {:4f}'.format(sess.run(loss)))
Training...
Iter: 0, Loss: 0.2977
Iter: 1, Loss: 0.2152
Iter: 2, Loss: 0.1787
Iter: 3, Loss: 0.1597
Iter: 4, Loss: 0.1277
Iter: 5, Loss: 0.1334
Iter: 6, Loss: 0.1000
Iter: 7, Loss: 0.1154
Iter: 8, Loss: 0.0989
Iter: 9, Loss: 0.0948
Test Loss: 0.082150
A simple model using batching and switching between train and test datasets with a reinitializable iterator:
# Wrapping it all together -> switch between train and test set using a reinitializable iterator
EPOCHS = 10
BATCH_SIZE = 16
# create a placeholder to dynamically switch between batch sizes
batch_size = tf.placeholder(tf.int64)
x, y = tf.placeholder(tf.float32, shape=[None, 2]), tf.placeholder(tf.float32, shape=[None, 1])
train_dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size).repeat()
test_dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)  # always batch, even if you want to "one-shot" it
# using two numpy arrays
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.random.sample((20, 2)), np.random.sample((20, 1)))
n_batches = train_data[0].shape[0] // BATCH_SIZE
# create an iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
features, labels = iter.get_next()
# create the initialisation operations
train_init_op = iter.make_initializer(train_dataset)
test_init_op = iter.make_initializer(test_dataset)
# make a simple model
net = tf.layers.dense(features, 8, activation=tf.tanh)  # the first value from iter.get_next() is the input
net = tf.layers.dense(net, 8, activation=tf.tanh)
prediction = tf.layers.dense(net, 1, activation=tf.tanh)
loss = tf.losses.mean_squared_error(prediction, labels)  # the second value from iter.get_next() is the label
train_op = tf.train.AdamOptimizer().minimize(loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # initialise the iterator with the train data
    sess.run(train_init_op, feed_dict={x: train_data[0], y: train_data[1], batch_size: BATCH_SIZE})
    print('Training...')
    for i in range(EPOCHS):
        tot_loss = 0
        for _ in range(n_batches):
            _, loss_value = sess.run([train_op, loss])
            tot_loss += loss_value
        print("Iter: {}, Loss: {:.4f}".format(i, tot_loss / n_batches))
    # initialise the iterator with the test data
    sess.run(test_init_op, feed_dict={x: test_data[0], y: test_data[1], batch_size: len(test_data[0])})
    print('Test Loss: {:4f}'.format(sess.run(loss)))
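Note that sess.run(loss) after test_init_op evaluates a single batch; it covers the whole test set here only because batch_size is fed as the test-set size. For a test set too large for one batch, a common pattern (a sketch, not part of the notes above; it must run inside the same tf.Session block) is to iterate until the non-repeating test dataset is exhausted and take a simple average of the per-batch losses:

# Evaluate over the whole test set in smaller batches.
sess.run(test_init_op, feed_dict={x: test_data[0], y: test_data[1], batch_size: 8})
test_losses = []
while True:
    try:
        test_losses.append(sess.run(loss))
    except tf.errors.OutOfRangeError:  # raised once the (non-repeating) test dataset runs out
        break
print('Test Loss: {:4f}'.format(np.mean(test_losses)))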