Unsupervised learning
M. Benesty
2017-11-09
library(fastrtext)
# Load the train_sentences and test_sentences example datasets
# (presumably shipped with fastrtext).
data("train_sentences")
data("test_sentences")

# Lower-case the "text" column of the training set to build the corpus.
texts <- tolower(train_sentences[, "text"])

# Scratch paths: one for the corpus file, one handed to "-output".
tmp_file_txt <- tempfile()
tmp_file_model <- tempfile()

# Write the corpus to disk, one element of `texts` per line.
writeLines(text = texts, con = tmp_file_txt)

# Train skipgram embeddings on the corpus file.
# c() coerces mixed input to character, so the verbosity level is written
# directly as a string here.
skipgram_args <- c(
  "skipgram",
  "-input", tmp_file_txt,
  "-output", tmp_file_model,
  "-verbose", "1"
)
execute(commands = skipgram_args)
## Read 0M words
## Number of words: 2061
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 17780 lr: 0.000000 loss: 2.548323 eta: 0h0m
# Load the trained model from the output path used during training.
model <- load_model(tmp_file_model)
## add .bin extension to the path
# Word extraction: show the first five entries of the model dictionary.
dict <- get_dictionary(model)
first_words <- head(dict, n = 5)
print(first_words)
## [1] "the" "</s>" "of" "to" "and"
# Print the embedding vectors of two related words.
query_words <- c("time", "timing")
print(get_word_vectors(model, query_words))
## [,1] [,2] [,3] [,4] [,5] [,6]
## time 0.1498772 0.02290783 -0.03268287 0.1460678 0.02033125 0.01802958
## timing 0.1878688 -0.03274329 -0.10827913 0.2235800 -0.01211387 -0.01014092
## [,7] [,8] [,9] [,10] [,11]
## time 0.07111942 -0.027984247 -0.08977791 -0.2869882 -0.1325668
## timing 0.09716900 0.003319858 -0.05333378 -0.2769039 -0.1442784
## [,12] [,13] [,14] [,15] [,16] [,17]
## time -0.1092853 -0.09805443 -0.1874921 0.1832326 0.2879117 -0.056885321
## timing -0.1292006 -0.03954900 -0.1708014 0.2198274 0.2438695 -0.009293673
## [,18] [,19] [,20] [,21] [,22]
## time -0.12794995 -0.1799751 0.09233951 -0.02820292 -0.04679530
## timing -0.08198225 -0.1715994 0.05768558 -0.07491447 -0.05680228
## [,23] [,24] [,25] [,26] [,27] [,28]
## time -0.06751674 0.2138946 -0.001396179 -0.1868226 0.10153314 -0.2061765
## timing -0.07986716 0.2249480 0.037504930 -0.2433395 0.08885589 -0.2458710
## [,29] [,30] [,31] [,32] [,33]
## time -0.01410296 -0.08472162 -0.009906900 0.08482639 -0.4186189
## timing 0.01865401 -0.08530596 0.005858552 0.07160229 -0.5244593
## [,34] [,35] [,36] [,37] [,38] [,39]
## time 0.2955389 -0.07427201 -0.10107207 -0.3715123 -0.02816198 -0.2534733
## timing 0.3799906 -0.15907784 -0.07513484 -0.3624268 0.01578266 -0.1903730
## [,40] [,41] [,42] [,43] [,44] [,45]
## time -0.1746802 -0.05858436 0.2755241 0.2938558 -0.04910021 -0.1817841
## timing -0.1692442 -0.02336441 0.3107487 0.2383262 -0.04096247 -0.2158681
## [,46] [,47] [,48] [,49] [,50] [,51]
## time -0.1745448 0.05736335 -0.1090353 -0.1737002 0.09187524 -0.1062071
## timing -0.1200930 0.03326904 -0.1962895 -0.2253897 0.12832397 -0.1376582
## [,52] [,53] [,54] [,55] [,56] [,57]
## time -0.2053116 0.2093680 0.2131047 -0.06776755 -0.1888424 -0.12263734
## timing -0.1881082 0.3006682 0.1644978 -0.09890648 -0.1641147 -0.09896671
## [,58] [,59] [,60] [,61] [,62] [,63]
## time -0.2305745 0.08540678 -0.08576719 -0.1192053 -0.1684984 -0.1899439
## timing -0.1775706 0.04817560 -0.07926985 -0.1294395 -0.1791741 -0.1726152
## [,64] [,65] [,66] [,67] [,68] [,69]
## time 0.3026449 0.03970969 -0.2824691 -0.2240442 -0.3609515 -0.00637367
## timing 0.2145917 -0.01082710 -0.3135381 -0.2506468 -0.3227807 -0.04040127
## [,70] [,71] [,72] [,73] [,74] [,75]
## time -0.1800622 0.02731089 0.1173367 -0.1439909 -0.01305110 -0.006152656
## timing -0.2025167 0.04077831 0.1002024 -0.1438981 0.03699574 -0.014864694
## [,76] [,77] [,78] [,79] [,80] [,81]
## time 0.1035556 0.01196939 0.2482304 -0.1323274 0.2200660 0.02708725
## timing 0.1104460 0.06166702 0.2405703 -0.2046521 0.2070118 0.10725947
## [,82] [,83] [,84] [,85] [,86]
## time 0.051380884 -0.02194721 -0.002686401 0.101234 -0.01932771
## timing 0.008298977 -0.04664503 -0.009812647 0.119327 -0.03310017
## [,87] [,88] [,89] [,90] [,91] [,92]
## time -0.04920077 0.2550172 -0.2522423 -0.1340010 0.2951374 0.04873769
## timing -0.05290403 0.2614736 -0.2779229 -0.1266379 0.3247549 0.01875537
## [,93] [,94] [,95] [,96] [,97] [,98]
## time 0.12032634 -0.07265050 -0.2186155 0.04495415 -0.2032597 -0.1301401
## timing 0.06479455 -0.02078289 -0.2315860 0.10141854 -0.1967423 -0.1266563
## [,99] [,100]
## time -0.10977820 0.09896509
## timing -0.07139827 0.14410906
# Distance between the vectors of two related words; printed explicitly,
# like the other results in this file.
print(get_word_distance(model, "time", "timing"))
## [,1]
## [1,] 0.02925578
# Free memory and remove the temporary files.
# tempfile() only returns a path, and the trained model is stored under that
# path with an extension appended (load_model reports adding ".bin"; fastText
# also writes a ".vec" text file next to it). Unlinking the bare path alone
# would leave those files behind, so remove them explicitly.
# unlink() silently ignores paths that do not exist.
unlink(tmp_file_txt)
unlink(c(tmp_file_model, paste0(tmp_file_model, c(".bin", ".vec"))))

# Drop the model object and trigger garbage collection.
rm(model)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 556647 29.8 940480 50.3 940480 50.3
## Vcells 1156151 8.9 1943012 14.9 1543057 11.8