Unsupervised learning

M. Benesty

2017-11-09

 library(fastrtext)

    # load the example sentence datasets
    data("train_sentences")
    data("test_sentences")

    # lower-case the training text and write it to a temporary file
    texts <- tolower(train_sentences[, "text"])
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()
    writeLines(text = texts, con = tmp_file_txt)

    # learn word embeddings with the skipgram command
    execute(
      commands = c(
        "skipgram",
        "-input", tmp_file_txt,
        "-output", tmp_file_model,
        "-verbose", 1
      )
    )
## Read 0M words
## Number of words:  2061
## Number of labels: 0
## Progress: 100.0%  words/sec/thread: 17780  lr: 0.000000  loss: 2.548323  eta: 0h0m
    # load the model written by the training step; as the message below
    # reports, the ".bin" extension is appended to the given path
    model <- load_model(tmp_file_model)
## add .bin extension to the path
    # extract the dictionary learned by the model and show its first five entries
    dict <- get_dictionary(model)
    print(head(dict, n = 5L))
## [1] "the"  "</s>" "of"   "to"   "and"
  # fetch the embeddings of two words and print them (one row per word)
  word_vectors <- get_word_vectors(model, c("time", "timing"))
  print(word_vectors)
##             [,1]        [,2]        [,3]      [,4]        [,5]        [,6]
## time   0.1498772  0.02290783 -0.03268287 0.1460678  0.02033125  0.01802958
## timing 0.1878688 -0.03274329 -0.10827913 0.2235800 -0.01211387 -0.01014092
##              [,7]         [,8]        [,9]      [,10]      [,11]
## time   0.07111942 -0.027984247 -0.08977791 -0.2869882 -0.1325668
## timing 0.09716900  0.003319858 -0.05333378 -0.2769039 -0.1442784
##             [,12]       [,13]      [,14]     [,15]     [,16]        [,17]
## time   -0.1092853 -0.09805443 -0.1874921 0.1832326 0.2879117 -0.056885321
## timing -0.1292006 -0.03954900 -0.1708014 0.2198274 0.2438695 -0.009293673
##              [,18]      [,19]      [,20]       [,21]       [,22]
## time   -0.12794995 -0.1799751 0.09233951 -0.02820292 -0.04679530
## timing -0.08198225 -0.1715994 0.05768558 -0.07491447 -0.05680228
##              [,23]     [,24]        [,25]      [,26]      [,27]      [,28]
## time   -0.06751674 0.2138946 -0.001396179 -0.1868226 0.10153314 -0.2061765
## timing -0.07986716 0.2249480  0.037504930 -0.2433395 0.08885589 -0.2458710
##              [,29]       [,30]        [,31]      [,32]      [,33]
## time   -0.01410296 -0.08472162 -0.009906900 0.08482639 -0.4186189
## timing  0.01865401 -0.08530596  0.005858552 0.07160229 -0.5244593
##            [,34]       [,35]       [,36]      [,37]       [,38]      [,39]
## time   0.2955389 -0.07427201 -0.10107207 -0.3715123 -0.02816198 -0.2534733
## timing 0.3799906 -0.15907784 -0.07513484 -0.3624268  0.01578266 -0.1903730
##             [,40]       [,41]     [,42]     [,43]       [,44]      [,45]
## time   -0.1746802 -0.05858436 0.2755241 0.2938558 -0.04910021 -0.1817841
## timing -0.1692442 -0.02336441 0.3107487 0.2383262 -0.04096247 -0.2158681
##             [,46]      [,47]      [,48]      [,49]      [,50]      [,51]
## time   -0.1745448 0.05736335 -0.1090353 -0.1737002 0.09187524 -0.1062071
## timing -0.1200930 0.03326904 -0.1962895 -0.2253897 0.12832397 -0.1376582
##             [,52]     [,53]     [,54]       [,55]      [,56]       [,57]
## time   -0.2053116 0.2093680 0.2131047 -0.06776755 -0.1888424 -0.12263734
## timing -0.1881082 0.3006682 0.1644978 -0.09890648 -0.1641147 -0.09896671
##             [,58]      [,59]       [,60]      [,61]      [,62]      [,63]
## time   -0.2305745 0.08540678 -0.08576719 -0.1192053 -0.1684984 -0.1899439
## timing -0.1775706 0.04817560 -0.07926985 -0.1294395 -0.1791741 -0.1726152
##            [,64]       [,65]      [,66]      [,67]      [,68]       [,69]
## time   0.3026449  0.03970969 -0.2824691 -0.2240442 -0.3609515 -0.00637367
## timing 0.2145917 -0.01082710 -0.3135381 -0.2506468 -0.3227807 -0.04040127
##             [,70]      [,71]     [,72]      [,73]       [,74]        [,75]
## time   -0.1800622 0.02731089 0.1173367 -0.1439909 -0.01305110 -0.006152656
## timing -0.2025167 0.04077831 0.1002024 -0.1438981  0.03699574 -0.014864694
##            [,76]      [,77]     [,78]      [,79]     [,80]      [,81]
## time   0.1035556 0.01196939 0.2482304 -0.1323274 0.2200660 0.02708725
## timing 0.1104460 0.06166702 0.2405703 -0.2046521 0.2070118 0.10725947
##              [,82]       [,83]        [,84]    [,85]       [,86]
## time   0.051380884 -0.02194721 -0.002686401 0.101234 -0.01932771
## timing 0.008298977 -0.04664503 -0.009812647 0.119327 -0.03310017
##              [,87]     [,88]      [,89]      [,90]     [,91]      [,92]
## time   -0.04920077 0.2550172 -0.2522423 -0.1340010 0.2951374 0.04873769
## timing -0.05290403 0.2614736 -0.2779229 -0.1266379 0.3247549 0.01875537
##             [,93]       [,94]      [,95]      [,96]      [,97]      [,98]
## time   0.12032634 -0.07265050 -0.2186155 0.04495415 -0.2032597 -0.1301401
## timing 0.06479455 -0.02078289 -0.2315860 0.10141854 -0.1967423 -0.1266563
##              [,99]     [,100]
## time   -0.10977820 0.09896509
## timing -0.07139827 0.14410906
  # distance between the embeddings of two words; the result is auto-printed
  # NOTE(review): presumably a cosine-based distance -- confirm in the package docs
  get_word_distance(model, "time", "timing")
##            [,1]
## [1,] 0.02925578
  # remove temporary files and free memory
  unlink(tmp_file_txt)
  # the model is stored as "<output>.bin" and "<output>.vec"; the bare path
  # returned by tempfile() is never created, so unlinking it removes nothing
  unlink(paste0(tmp_file_model, c(".bin", ".vec")))
  # drop the model object, then run the garbage collector to release its memory
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  556647 29.8     940480 50.3   940480 50.3
## Vcells 1156151  8.9    1943012 14.9  1543057 11.8