Python/TensorFlow
Title: How to use Dataset in TensorFlow (tf.data) pipeline
김윤중

TTS data pipeline

import numpy as np
import tensorflow as tf
# pa, pt, hp: project modules (audio processing, text processing, hyperparameters)

class _dataset:
    def __init__(self, name, sess, ids):  # ids: ['LJ001-0001', 'LJ001-0002', ...]
        self.name = name
        self.sess = sess
        self.transcript = np.loadtxt('./data/LJSpeech-1.1/transcript.csv', dtype=str, delimiter='|', encoding='utf-8')
        # [['LJ001-0001', 'printing , ...'], [...], ...]
        self.id2text = {id: text for id, text in self.transcript[:, :2]}  # {'LJ001-0001': 'Printing,...', 'LJ001-0002': 'Writing,..', ...}
        self.texts = [self.id2text[id] for id in ids]  # ['Printing,...', 'Writing,..', ...]
        self.ids = ids  # ['LJ001-0001', 'LJ001-0002', ...]
        self.ch2id = {v: id for id, v in enumerate(pa.vocab)}
        file_paths = ['./data/LJSpeech-1.1/wavs/{}.wav'.format(id) for id in self.ids]
        self.create_tf_dataset(file_paths, self.ids, hp.BATCH_SIZE)
        # sets self.encoder_code, self.decoder_input, self.mel_spectro, self.linear_spectro

    def text_to_code(self, text):
        text = pt.text_normalize(text)
        return pt.transform_text_to_code(text, self.ch2id, pt.NB_CHARS_MAX)

    def codes(self, offset):
        code = np.asarray(self.text_to_code(self.texts[offset]))
        return code

    def create_tf_dataset(self, file_paths, ids, batch_size):
        def parse_function(data_path, id):
            def mel_linear_spectro_decod_input(data_path, id):  # e.g. '../wav/LJ001-0001.wav', 'LJ001-0001'
                mel_spectro, linear_spectro, decoder_input = pa.padded_mel_linear_spectro_decod_input(data_path)
                idstr = ''.join([chr(b) for b in id])  # id arrives as bytes, e.g. b'LJ001-0001'
                encoder_input = self.text_to_code(self.id2text[idstr])
                return [encoder_input, decoder_input, mel_spectro, linear_spectro]
            y = tf.py_func(
                mel_linear_spectro_decod_input,                  # python function to wrap
                [data_path, id],                                 # its arguments
                [tf.int32, tf.float32, tf.float32, tf.float32])  # its return types
            return y

        dataset = tf.data.Dataset.from_tensor_slices((file_paths, ids))
        dataset = dataset.map(parse_function, num_parallel_calls=4)  # run parse_function with 4 parallel calls
        dataset = dataset.repeat().batch(batch_size)                 # repeat first so every batch stays full
        dataset = dataset.prefetch(1)                                # prepare one batch in advance
        iter = tf.data.Iterator.from_structure(
            (tf.int32, tf.float32, tf.float32, tf.float32),  # output types
            ((batch_size, 200), (batch_size, 200, 80), (batch_size, 200, 400), (batch_size, 850, 513)))  # output shapes
            # [encoder_input, decoder_input, mel_spectro, linear_spectro]
        self.encoder_code, self.decoder_input, self.mel_spectro, self.linear_spectro = iter.get_next()
        self.init_operation = iter.make_initializer(dataset)

    def init_op(self):
        self.sess.run(self.init_operation)
Usage example:

ds = _dataset('train', sess, id_list)
ds.init_op()
# ds.encoder_code, ds.decoder_input, ds.mel_spectro, ds.linear_spectro are now tensors from the pipeline
model.train_on_batch([ds.encoder_code, ds.decoder_input], [ds.mel_spectro, ds.linear_spectro])
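The class above depends on project-specific helpers (pa, pt, hp), so it cannot run on its own. Below is a rough, self-contained sketch of the same Dataset.map + tf.py_func pattern it uses, with a fake load_example function standing in for the real spectrogram/text processing (TensorFlow 1.x assumed; load_example and its outputs are made up for illustration):

import numpy as np
import tensorflow as tf

ids = ['LJ001-0001', 'LJ001-0002', 'LJ001-0003', 'LJ001-0004']
file_paths = ['./data/LJSpeech-1.1/wavs/{}.wav'.format(i) for i in ids]

def load_example(path, id):
    # stand-in for the real wav/text processing; path and id arrive as numpy bytes
    idstr = id.decode('utf-8')                                # e.g. 'LJ001-0001'; the real code looks up its transcript here
    encoder_input = np.full(200, len(idstr), dtype=np.int32)  # fake text codes
    mel_spectro = np.zeros((200, 80), dtype=np.float32)       # fake mel spectrogram
    return encoder_input, mel_spectro

def parse_function(path, id):
    enc, mel = tf.py_func(load_example, [path, id], [tf.int32, tf.float32])
    enc.set_shape([200])        # py_func drops static shape info, so restore it
    mel.set_shape([200, 80])
    return enc, mel

dataset = tf.data.Dataset.from_tensor_slices((file_paths, ids))
dataset = dataset.map(parse_function, num_parallel_calls=4)
dataset = dataset.repeat().batch(2).prefetch(1)

iterator = dataset.make_initializable_iterator()
encoder_code, mel_spectro = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer)
    enc_batch, mel_batch = sess.run([encoder_code, mel_spectro])
    print(enc_batch.shape, mel_batch.shape)   # (2, 200) (2, 200, 80)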
def parse_function(filename, label):
    return [label, label], label

def get_dataset(filenames, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
    dataset = dataset.shuffle(len(filenames))
    dataset = dataset.map(parse_function, num_parallel_calls=4)
    dataset = dataset.batch(batch_size)   # batch() before repeat(): the last batch of a pass can be smaller
    dataset = dataset.prefetch(1)
    dataset = dataset.repeat()
    return dataset

filenames = ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a']
labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
batch_size = 4

dataset = get_dataset(filenames, labels, batch_size)
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()
init_op = iterator.initializer

with tf.Session() as sess:
    sess.run(init_op)
    print(sess.run(next_element))
    print(sess.run(next_element))
    print(sess.run(next_element))   # only 2 elements left in this pass
    print(sess.run(next_element))

"""
(array([[2, 2], [9, 9], [1, 1], [8, 8]]), array([2, 9, 1, 8]))
(array([[ 6,  6], [ 5,  5], [10, 10], [ 3,  3]]), array([ 6,  5, 10,  3]))
(array([[4, 4], [7, 7]]), array([4, 7]))
(array([[4, 4], [9, 9], [3, 3], [6, 6]]), array([4, 9, 3, 6]))
"""

 

The built-in Input Pipeline. Never use ‘feed-dict’ anymore

Updated to TensorFlow 1.8
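As a minimal illustration of that claim (not from the original post; the toy numbers are made up): with feed_dict every sess.run has to be fed a numpy array by hand, while with tf.data the iterator's get_next() tensor feeds the graph directly.

import numpy as np
import tensorflow as tf

data = np.arange(10, dtype=np.float32)            # toy data

# feed_dict style: a placeholder must be fed on every run
x_ph = tf.placeholder(tf.float32, shape=[None])
doubled_fed = x_ph * 2.0

# tf.data style: the pipeline itself produces the next element
dataset = tf.data.Dataset.from_tensor_slices(data).batch(5)
next_batch = dataset.make_one_shot_iterator().get_next()
doubled_piped = next_batch * 2.0

with tf.Session() as sess:
    print(sess.run(doubled_fed, feed_dict={x_ph: data[:5]}))  # old way
    print(sess.run(doubled_piped))                            # no feed_dict needed
    print(sess.run(doubled_piped))                            # next batch arrives automatically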

A simple model using batching and switching between train and test datasets using an initializable iterator

# Wrapping all together -> Switch between train and test set using Initializable iterator
EPOCHS = 10
BATCH_SIZE = 16   # defined here so the training loop below can run

# create a placeholder to dynamically switch between batch sizes
batch_size = tf.placeholder(tf.int64)
x, y = tf.placeholder(tf.float32, shape=[None, 2]), tf.placeholder(tf.float32, shape=[None, 1])
dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size).repeat()
iter = dataset.make_initializable_iterator()
features, labels = iter.get_next()

# using two numpy arrays
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data  = (np.random.sample((20, 2)),  np.random.sample((20, 1)))
n_batches = train_data[0].shape[0] // BATCH_SIZE

# make a simple model
net = tf.layers.dense(features, 8, activation=tf.tanh)  # the first value from iter.get_next() is the input
net = tf.layers.dense(net, 8, activation=tf.tanh)
prediction = tf.layers.dense(net, 1, activation=tf.tanh)
loss = tf.losses.mean_squared_error(prediction, labels)  # the second value from iter.get_next() is the label
train_op = tf.train.AdamOptimizer().minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # initialise the iterator with the train data
    sess.run(iter.initializer, feed_dict={x: train_data[0], y: train_data[1], batch_size: BATCH_SIZE})
    print('Training...')
    for i in range(EPOCHS):
        tot_loss = 0
        for _ in range(n_batches):
            _, loss_value = sess.run([train_op, loss])  # no feed_dict
            tot_loss += loss_value
        print("Iter: {}, Loss: {:.4f}".format(i, tot_loss / n_batches))
    # initialise the iterator with the test data
    sess.run(iter.initializer, feed_dict={x: test_data[0], y: test_data[1], batch_size: test_data[0].shape[0]})
    print('Test Loss: {:4f}'.format(sess.run(loss)))
Training...
Iter: 0, Loss: 0.2977
Iter: 1, Loss: 0.2152
Iter: 2, Loss: 0.1787
Iter: 3, Loss: 0.1597
Iter: 4, Loss: 0.1277
Iter: 5, Loss: 0.1334
Iter: 6, Loss: 0.1000
Iter: 7, Loss: 0.1154
Iter: 8, Loss: 0.0989
Iter: 9, Loss: 0.0948
Test Loss: 0.082150

 

A simple model using batching and switching between train and test datasets using a reinitializable iterator

# Wrapping all together -> Switch between train and test set using Reinitializable iterator
EPOCHS = 10

# create a placeholder to dynamically switch between batch sizes
batch_size = tf.placeholder(tf.int64)
x, y = tf.placeholder(tf.float32, shape=[None, 2]), tf.placeholder(tf.float32, shape=[None, 1])
train_dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size).repeat()
test_dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)  # always batch, even if you want to one-shot it

# using two numpy arrays
train_data = (np.random.sample((100, 2)), np.random.sample((100, 1)))
test_data = (np.random.sample((20, 2)), np.random.sample((20, 1)))
n_batches = train_data[0].shape[0] // 16  # 16 matches the batch_size fed below

# create an iterator of the correct shape and type
iter = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
features, labels = iter.get_next()
# create the initialisation operations
train_init_op = iter.make_initializer(train_dataset)
test_init_op = iter.make_initializer(test_dataset)

# make a simple model
net = tf.layers.dense(features, 8, activation=tf.tanh)  # the first value from iter.get_next() is the input
net = tf.layers.dense(net, 8, activation=tf.tanh)
prediction = tf.layers.dense(net, 1, activation=tf.tanh)
loss = tf.losses.mean_squared_error(prediction, labels)  # the second value from iter.get_next() is the label
train_op = tf.train.AdamOptimizer().minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # initialise iterator with train data
    sess.run(train_init_op, feed_dict={x: train_data[0], y: train_data[1], batch_size: 16})
    print('Training...')
    for i in range(EPOCHS):
        tot_loss = 0
        for _ in range(n_batches):
            _, loss_value = sess.run([train_op, loss])
            tot_loss += loss_value
        print("Iter: {}, Loss: {:.4f}".format(i, tot_loss / n_batches))
    # initialise iterator with test data
    sess.run(test_init_op, feed_dict={x: test_data[0], y: test_data[1], batch_size: len(test_data[0])})
    print('Test Loss: {:4f}'.format(sess.run(loss)))
