
Neural Machine Translation

library(tensorflow)
library(keras)
library(data.table)
library(tfdatasets)
library(tfaddons)

# Preprocessing -----------------------------------------------------------

# Downloads one of the bilingual datasets offered at http://www.manythings.org/anki/
# and unzips it into a directory "data" (skipped if the files are already present).
# This example translates English to Dutch.
download_data = function(){
  if(!dir.exists('data')) {
    dir.create('data')
  }
  if(!file.exists('data/nld-eng.zip')) {
    download.file('http://www.manythings.org/anki/nld-eng.zip',
                  destfile = file.path("data", basename('nld-eng.zip')))
    unzip('data/nld-eng.zip', exdir = 'data')
  }
}

download_data()

filepath <- file.path("data", "nld.txt")

df = data.table::fread(filepath, header = FALSE, encoding = 'UTF-8',
                       select = c(1,2), nrows = -1)

text_cleaner <- function(text){
  text %>%
    # replace non-ASCII characters
    textclean::replace_non_ascii() %>%
    # strip all non-relevant symbols (only letters and spaces are retained;
    # apostrophes are removed as well)
    textclean::strip(apostrophe.remove = TRUE) %>%
    # add the <start>/<end> markers used by the decoder
    paste('<start> ', ., ' <end>')
}

df = sapply(1:2, function(x) text_cleaner(df[[x]])) %>% as.data.table()
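As a quick sanity check, the cleaner can be applied to a single sentence; assuming the default textclean settings it should return something like "<start>  how are you  <end>" (lower-cased, punctuation stripped, wrapped in the start/end markers).

text_cleaner("How are you?")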

text_tok <- function(text) {
  tokenizer = text_tokenizer(filters = '')
  tokenizer %>% fit_text_tokenizer(text)
  # +1 because index 0 is reserved for the padding added by pad_sequences()
  vocab_size = length(tokenizer$word_index) + 1L
  data = tokenizer %>%
    texts_to_sequences(text) %>%
    pad_sequences(padding = 'post')
  list(vocab_size, data, tokenizer)
}

c(input_vocab_size, data_en, tokenizer_en) %<-% text_tok(df[['V1']])

c(output_vocab_size, data_de, tokenizer_de) %<-% text_tok(df[['V2']])
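Optionally, inspect what tokenization produced; the exact dimensions and vocabulary sizes depend on the dataset version downloaded, so the values are not fixed.

dim(data_en)   # number of sentence pairs x padded English sequence length
dim(data_de)   # number of sentence pairs x padded Dutch sequence length
c(input_vocab_size, output_vocab_size)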


# Split the dataset
indices_to_take = sample.int(n = nrow(df), size = floor(0.8*nrow(df)), replace = FALSE)

split_data <- function(data) {
  c(train, test) %<-% list(data[indices_to_take, ], data[-indices_to_take, ] )
  list(train, test)
}


c(en_train, en_test, de_train, de_test) %<-% c(split_data(data_en), split_data(data_de))

rm(df, filepath, indices_to_take, download_data, split_data, text_cleaner, text_tok)

batch_size = 64L
buffer_size = nrow(en_train)
steps_per_epoch = buffer_size  %/% batch_size
embedding_dims = 256L
rnn_units = 1024L
dense_units = 1024L
dtype = tf$float32   #used to initialize DecoderCell Zero state


dataset = tensor_slices_dataset(list(en_train, de_train)) %>%
  dataset_shuffle(buffer_size) %>% dataset_batch(batch_size, drop_remainder = TRUE)
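As an optional check, a single batch can be pulled from the pipeline; each element should be a pair of [batch_size, sequence_length] integer tensors (the example_batch name below is just for this check).

example_batch = dataset %>% dataset_take(1) %>% iterate()
example_batch[[1]]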


EncoderNetwork = reticulate::PyClass(
  'EncoderNetwork',
  inherit = tf$keras$Model,
  defs = list(

    `__init__` = function(self, input_vocab_size, embedding_dims, rnn_units) {

      super()$`__init__`()

      self$encoder_embedding = layer_embedding(input_dim = input_vocab_size,
                                               output_dim = embedding_dims)
      self$encoder_rnnlayer = layer_lstm(units = rnn_units, return_sequences = TRUE,
                                         return_state = TRUE)
      NULL
    }
  )
)



DecoderNetwork = reticulate::PyClass(
  'DecoderNetwork',
  inherit = tf$keras$Model,
  defs = list(

    `__init__` = function(self, output_vocab_size, embedding_dims, rnn_units) {

      super()$`__init__`()
      self$decoder_embedding = layer_embedding(input_dim = output_vocab_size,
                                               output_dim = embedding_dims)
      self$dense_layer = layer_dense(units = output_vocab_size)
      self$decoder_rnncell = tf$keras$layers$LSTMCell(rnn_units)
      # Sampler
      self$sampler = sampler_training()
      # Create attention mechanism with memory = NULL
      self$attention_mechanism = self$build_attention_mechanism(dense_units, NULL, c(rep(ncol(data_en), batch_size)))
      self$rnn_cell =  self$build_rnn_cell(batch_size)
      self$decoder = decoder_basic(cell=self$rnn_cell, sampler = self$sampler,
                                   output_layer = self$dense_layer)
      NULL
    },

    build_attention_mechanism = function(self, units, memory, memory_sequence_length) {
      attention_luong(units = units , memory = memory,
                      memory_sequence_length = memory_sequence_length)
    },

    build_rnn_cell = function(self, batch_size) {
      rnn_cell = attention_wrapper(cell = self$decoder_rnncell,
                                   attention_mechanism = self$attention_mechanism,
                                   attention_layer_size = dense_units)
      rnn_cell
    },

    build_decoder_initial_state = function(self, batch_size, encoder_state, dtype) {
      decoder_initial_state = self$rnn_cell$get_initial_state(batch_size = batch_size,
                                                              dtype = dtype)
      decoder_initial_state = decoder_initial_state$clone(cell_state = encoder_state)
      decoder_initial_state
    }
  )
)

encoderNetwork = EncoderNetwork(input_vocab_size, embedding_dims, rnn_units)
decoderNetwork = DecoderNetwork(output_vocab_size, embedding_dims, rnn_units)
optimizer = tf$keras$optimizers$Adam()



loss_function <- function(y_pred, y) {
  # shape of y:      [batch_size, Ty]
  # shape of y_pred: [batch_size, Ty, output_vocab_size]
  # the dense output layer has no softmax, so treat y_pred as raw logits
  loss = keras::loss_sparse_categorical_crossentropy(y, y_pred, from_logits = TRUE)
  mask = tf$logical_not(tf$math$equal(y, 0L))   # 0 for padding positions, 1 elsewhere
  mask = tf$cast(mask, dtype = loss$dtype)
  loss = mask * loss
  loss = tf$reduce_mean(loss)
  loss
}
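To see what the mask does, a toy call with made-up values can help: the two trailing zeros below stand for padding and are zeroed out before averaging (y_toy and y_pred_toy are hypothetical names).

y_toy      = tf$constant(matrix(c(2L, 3L, 0L, 0L), nrow = 1))
y_pred_toy = tf$random$normal(c(1L, 4L, output_vocab_size))
loss_function(y_pred_toy, y_toy)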

train_step <- function(input_batch, output_batch,encoder_initial_cell_state) {
  loss = 0L

  with(tf$GradientTape() %as% tape, {
    encoder_emb_inp = encoderNetwork$encoder_embedding(input_batch)
    c(a, a_tx, c_tx) %<-% encoderNetwork$encoder_rnnlayer(encoder_emb_inp,
                                                          initial_state = encoder_initial_cell_state)

    #[last step activations,last memory_state] of encoder passed as input to decoder Network
    # Prepare correct Decoder input & output sequence data
    decoder_input = tf$convert_to_tensor(output_batch %>% as.array() %>% .[, 1:(ncol(data_de) - 1)]) # drop the final time step
    # compare logits with the decoder input shifted by one time step
    decoder_output = tf$convert_to_tensor(output_batch %>% as.array() %>% .[, 2:ncol(data_de)]) # drop <start>

    # Decoder Embeddings
    decoder_emb_inp = decoderNetwork$decoder_embedding(decoder_input)

    #Setting up decoder memory from encoder output and Zero State for AttentionWrapperState
    decoderNetwork$attention_mechanism$setup_memory(a)
    decoder_initial_state = decoderNetwork$build_decoder_initial_state(batch_size,
                                                                       encoder_state = list(a_tx, c_tx),
                                                                       dtype = tf$float32)
    #BasicDecoderOutput
    c(outputs, res1, res2) %<-% decoderNetwork$decoder(decoder_emb_inp, initial_state = decoder_initial_state,
                                                       sequence_length = c(rep(ncol(data_de) - 1L, batch_size)))

    logits = outputs$rnn_output
    #Calculate loss

    loss = loss_function(logits, decoder_output)

  })
  #Returns the list of all layer variables / weights.
  variables = c(encoderNetwork$trainable_variables, decoderNetwork$trainable_variables)
  # differentiate loss wrt variables
  gradients = tape$gradient(loss, variables)
  #grads_and_vars – List of(gradient, variable) pairs.
  grads_and_vars = purrr::transpose(list(gradients,variables))
  optimizer$apply_gradients(grads_and_vars)
  loss
}

initialize_initial_state = function() {
  list(tf$zeros(c(batch_size, rnn_units)), tf$zeros(c(batch_size, rnn_units)))
}
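Before starting the full loop, a single batch can optionally be pushed through train_step as a smoke test; the returned value should be a scalar loss tensor (smoke_batch, inp and outp are just illustrative names).

smoke_batch = dataset %>% dataset_take(1) %>% iterate()
c(inp, outp) %<-% smoke_batch[[1]]
train_step(inp, outp, initialize_initial_state())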


epochs = 1


for (i in seq_len(epochs)) {
  encoder_initial_cell_state = initialize_initial_state()
  total_loss = 0.0
  res = dataset %>% dataset_take(steps_per_epoch) %>% iterate()
  for (batch in 1:length(res)) {
    c(input_batch, output_batch) %<-% res[[batch]]
    batch_loss = train_step(input_batch, output_batch, encoder_initial_cell_state)
    total_loss = total_loss + batch_loss
    if (batch %% 5 == 0) {
      print(paste('batch loss:', batch_loss$numpy(), 'epoch', i, 'batch', batch))
    }
  }

}
