if not modules then modules = { } end modules ['l-unicode'] = {
    version   = 1.001,
    optimize  = true,
    comment   = "companion to luat-lib.mkiv",
    author    = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
    copyright = "PRAGMA ADE / ConTeXt Development Team",
    license   = "see context related readme files"
}

-- floor(b/256)  => rshift(b, 8)
-- floor(b/1024) => rshift(b,10)

-- in lua 5.3:

-- utf8.char(···)         : concatenated
-- utf8.charpatt          : "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
-- utf8.codes(s)          : for p, c in utf8.codes(s) do body end
-- utf8.codepoint(s [, i [, j]])
-- utf8.len(s [, i])
-- utf8.offset(s, n [, i])

-- todo: utf.sub replacement (used in syst-aux)

-- we put these in the utf namespace:

-- used     : byte char len lower sub upper
-- not used : dump find format gmatch gfind gsub match rep reverse

-- utf = utf or (unicode and unicode.utf8) or { }

-- not supported:
--
-- dump, find, format, gfind, gmatch, gsub, lower, match, rep, reverse, upper

utf     = utf or { }
unicode = nil

if not string.utfcharacters then

    -- New: this gmatch hack is taken from the Lua 5.2 book. It's about two times slower
    -- than the built-in string.utfcharacters.

    local gmatch = string.gmatch

    local function utfcharacters(str)
        return gmatch(str,".[\128-\191]*")
    end

    -- The fallback has to be installed under the name that is tested above and that
    -- is picked up below (utf.characters), otherwise utf.characters stays nil.

    string.utfcharacters = utfcharacters
    string.characters    = utfcharacters -- kept: this was the (only) name set before

end

utf.characters = string.utfcharacters

-- string.utfvalues
-- string.utfcharacters
-- string.characters
-- string.characterpairs
-- string.bytes
-- string.bytepairs
-- string.utflength
-- string.utfvalues
-- string.utfcharacters

local type = type
local char, byte, format, sub, gmatch = string.char, string.byte, string.format, string.sub, string.gmatch
local concat = table.concat
local P, C, R, Cs, Ct, Cmt, Cc, Carg, Cp = lpeg.P, lpeg.C, lpeg.R, lpeg.Cs, lpeg.Ct, lpeg.Cmt, lpeg.Cc, lpeg.Carg, lpeg.Cp

local lpegmatch       = lpeg.match
local patterns        = lpeg.patterns
local tabletopattern  = lpeg.utfchartabletopattern

local bytepairs       = string.bytepairs

local finder          = lpeg.finder
local replacer        = lpeg.replacer

local p_utftype       = patterns.utftype
local p_utfstricttype = patterns.utfstricttype
local p_utfoffset     = patterns.utfoffset
local p_utf8character = patterns.utf8character
local p_utf8char      = patterns.utf8char
local p_utf8byte      = patterns.utf8byte
local p_utfbom        = patterns.utfbom
local p_newline       = patterns.newline
local p_whitespace    = patterns.whitespace

-- if not unicode then
--     unicode = { utf = utf } -- for a while
-- end

-- utf.char(n) : converts one code point into its utf-8 byte sequence. We prefer a
-- built-in (string.utfcharacter or utf8.char) and only fall back to the Lua variants
-- below. The fallbacks take one number and return "" for values >= 0x200000.

if not utf.char then

    utf.char = string.utfcharacter or (utf8 and utf8.char)

    if not utf.char then

        -- no multiples

        local char = string.char

        if bit32 then

            local rshift = bit32.rshift

            function utf.char(n)
                if n < 0x80 then
                    -- 0aaaaaaa : 0x80
                    return char(n)
                elseif n < 0x800 then
                    -- 110bbbaa : 0xC0 : n >> 6
                    -- 10aaaaaa : 0x80 : n & 0x3F
                    return char(
                        0xC0 + rshift(n,6),
                        0x80 + (n % 0x40)
                    )
                elseif n < 0x10000 then
                    -- 1110bbbb : 0xE0 :  n >> 12
                    -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
                    -- 10aaaaaa : 0x80 :  n        & 0x3F
                    return char(
                        0xE0 + rshift(n,12),
                        0x80 + (rshift(n,6) % 0x40),
                        0x80 + (n % 0x40)
                    )
                elseif n < 0x200000 then
                    -- 11110ccc : 0xF0 :  n >> 18
                    -- 10ccbbbb : 0x80 : (n >> 12) & 0x3F
                    -- 10bbbbaa : 0x80 : (n >>  6) & 0x3F
                    -- 10aaaaaa : 0x80 :  n        & 0x3F
                    -- dddd     : ccccc - 1
                    return char(
                        0xF0 + rshift(n,18),
                        0x80 + (rshift(n,12) % 0x40),
                        0x80 + (rshift(n,6) % 0x40),
                        0x80 + (n % 0x40)
                    )
                else
                    return ""
                end
            end

        else

            local floor = math.floor

            function utf.char(n)
                if n < 0x80 then
                    return char(n)
                elseif n < 0x800 then
                    return char(
                        0xC0 + floor(n/0x40),
                        0x80 + (n % 0x40)
                    )
                elseif n < 0x10000 then
                    return char(
                        0xE0 + floor(n/0x1000),
                        0x80 + (floor(n/0x40) % 0x40),
                        0x80 + (n % 0x40)
                    )
                elseif n < 0x200000 then
                    return char(
                        0xF0 + floor(n/0x40000),
                        0x80 + (floor(n/0x1000) % 0x40),
                        0x80 + (floor(n/0x40) % 0x40),
                        0x80 + (n % 0x40)
                    )
                else
                    return ""
                end
            end

        end

    end

end

-- utf.byte(c) : the reverse of utf.char; the fallback matches p_utf8byte against c.

if not utf.byte then

    utf.byte = string.utfvalue or (utf8 and utf8.codepoint)

    if not utf.byte then

        function utf.byte(c)
            return lpegmatch(p_utf8byte,c)
        end

    end

end

local utfchar, utfbyte = utf.char, utf.byte

-- As we want to get rid of the (unmaintained) utf library we implement our own
-- variants (in due time an independent module):

-- Returns whatever patterns.utftype captures for data, or "unknown" when there is no
-- data or no match.

function utf.filetype(data)
    return data and lpegmatch(p_utftype,data) or "unknown"
end

-- Single byte characters pass untouched, multi-byte characters are replaced by a
-- numeric entity.
--
-- NOTE(review): the format is "&#%X;", so hexadecimal digits without the "x" marker
-- that xml/html expects for hexadecimal references; left as-is because callers may
-- depend on the current output -- confirm before changing.

local toentities = Cs (
    (
        patterns.utf8one
      + (
            patterns.utf8two
          + patterns.utf8three
          + patterns.utf8four
        ) / function(s) local b = utfbyte(s) if b < 127 then return s else return format("&#%X;",b) end end
    )^0
)

patterns.toentities = toentities

function utf.toentities(str)
    return lpegmatch(toentities,str)
end

-- local utfchr = { } -- 60K -> 2.638 M extra mem but currently not called that often (on latin)
--
-- setmetatable(utfchr, { __index = function(t,k) local v = utfchar(k) t[k] = v return v end } )
--
-- collectgarbage("collect")
-- local u = collectgarbage("count")*1024
-- local t = os.clock()
-- for i=1,1000 do
--     for i=1,600 do
--         local a = utfchr[i]
--     end
-- end
-- print(os.clock()-t,collectgarbage("count")*1024-u)

-- collectgarbage("collect")
-- local t = os.clock()
-- for i=1,1000 do
--     for i=1,600 do
--         local a = utfchar(i)
--     end
-- end
-- print(os.clock()-t,collectgarbage("count")*1024-u)

-- local byte = string.byte
-- local utfchar = utf.char

-- string.toutf(s) : converts a utf-16 string that starts with a bom (FE FF is big
-- endian, FF FE is little endian) into utf-8. Anything else is returned untouched.
--
-- Surrogate pairs are recognized at the byte level: the high byte of a leading unit
-- is 0xD8..0xDB and the high byte of a trailing unit is 0xDC..0xDF. (lpeg.R wants
-- two-byte range strings, so we cannot feed it utf-8 encoded characters, and a code
-- unit is 0x100 * high + low.)

local one      = P(1)
local two      = C(1) * C(1)
local p_high   = R("\216\219") -- 0xD8 .. 0xDB
local p_low    = R("\220\223") -- 0xDC .. 0xDF
local four_be  = C(p_high) * C(1) * C(p_low) * C(1) -- high byte comes first
local four_le  = C(1) * C(p_high) * C(1) * C(p_low) -- high byte comes second

local function surrogates(a,b,c,d) -- a b : leading unit, c d : trailing unit, high byte first
    local ab = 0x100 * byte(a) + byte(b)
    local cd = 0x100 * byte(c) + byte(d)
    return utfchar((ab-0xD800)*0x400 + (cd-0xDC00) + 0x10000)
end

local pattern =
    P("\254\255") * Cs( (
        four_be / surrogates
      + two     / function(a,b)
            return utfchar(byte(a)*256 + byte(b))
        end
      + one
    )^1 )
  + P("\255\254") * Cs( (
        four_le / function(b,a,d,c)
            return surrogates(a,b,c,d)
        end
      + two     / function(b,a)
            return utfchar(byte(a)*256 + byte(b))
        end
      + one
    )^1 )

function string.toutf(s) -- in string namespace
    return lpegmatch(pattern,s) or s -- todo: utf32
end

-- Every byte that is not part of a one to four byte utf-8 sequence is replaced, so
-- utf.is_valid returns the cleaned up string (and not true) for a string argument,
-- and false for anything else.

local validatedutf = Cs (
    (
        patterns.utf8one
      + patterns.utf8two
      + patterns.utf8three
      + patterns.utf8four
      + P(1) / "�"
    )^0
)

patterns.validatedutf = validatedutf

function utf.is_valid(str)
    return type(str) == "string" and lpegmatch(validatedutf,str) or false
end

if not utf.len then

    utf.len = string.utflength or (utf8 and utf8.len)

    if not utf.len then

        -- -- alternative 1: 0.77
        --
        -- local utfcharcounter = utfbom^-1 * Cs((p_utf8character/'!')^0)
        --
        -- function utf.len(str)
        --     return #lpegmatch(utfcharcounter,str or "")
        -- end
        --
        -- -- alternative 2: 1.70
        --
        -- local n = 0
        --
        -- local utfcharcounter = utfbom^-1 * (p_utf8character/function() n = n + 1 end)^0 -- slow
        --
        -- function utf.length(str)
        --     n = 0
        --     lpegmatch(utfcharcounter,str or "")
        --     return n
        -- end
        --
        -- -- alternative 3: 0.24 (native unicode.utf8.len: 0.047)

        -- local n = 0
        --
        -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( ( Cp() * (
        -- --     patterns.utf8one  ^1 * Cc(1)
        -- --   + patterns.utf8two  ^1 * Cc(2)
        -- --   + patterns.utf8three^1 * Cc(3)
        -- --   + patterns.utf8four ^1 * Cc(4) ) * Cp() / function(f,d,t) n = n + (t - f)/d end
        -- --  )^0 ) -- just as many captures as below
        --
        -- -- local utfcharcounter = lpeg.patterns.utfbom^-1 * P ( (
        -- --     (Cmt(patterns.utf8one  ^1,function(_,_,s) n = n + #s   return true end))
        -- --   + (Cmt(patterns.utf8two  ^1,function(_,_,s) n = n + #s/2 return true end))
        -- --   + (Cmt(patterns.utf8three^1,function(_,_,s) n = n + #s/3 return true end))
        -- --   + (Cmt(patterns.utf8four ^1,function(_,_,s) n = n + #s/4 return true end))
        -- -- )^0 ) -- not interesting as it creates strings but sometimes faster
        --
        -- -- The best so far:
        --
        -- local utfcharcounter = utfbom^-1 * P ( (
        --     Cp() * (patterns.utf8one  )^1 * Cp() / function(f,t) n = n +  t - f    end
        --   + Cp() * (patterns.utf8two  )^1 * Cp() / function(f,t) n = n + (t - f)/2 end
        --   + Cp() * (patterns.utf8three)^1 * Cp() / function(f,t) n = n + (t - f)/3 end
        --   + Cp() * (patterns.utf8four )^1 * Cp() / function(f,t) n = n + (t - f)/4 end
        -- )^0 )

        -- function utf.len(str)
        --     n = 0
        --     lpegmatch(utfcharcounter,str or "")
        --     return n
        -- end

        -- We count runs of characters that have the same width: each run adds
        -- (run length in bytes) / (bytes per character). The start of a run is
        -- captured with Cp(), so that an optional leading bom is not counted (a
        -- shared "previous position" that starts at 1 would count the bom bytes
        -- as characters).

        local n = 0

        local utfcharcounter = patterns.utfbom^-1 * Cmt (
            Cp() * (
                Cc(1) * patterns.utf8one  ^1
              + Cc(2) * patterns.utf8two  ^1
              + Cc(3) * patterns.utf8three^1
              + Cc(4) * patterns.utf8four ^1
            ),
            function(_,t,f,d) -- due to Cc no string captures, so faster
                n = n + (t - f)/d
                return true
            end
        )^0

        function utf.len(str)
            n = 0
            lpegmatch(utfcharcounter,str or "")
            return n
        end

        -- -- these are quite a bit slower:

        -- utfcharcounter = utfbom^-1 * (Cmt(P(1) * R("\128\191")^0, function() n = n + 1 return true end))^0 -- 50+ times slower
        -- utfcharcounter = utfbom^-1 * (Cmt(P(1), function() n = n + 1 return true end) * R("\128\191")^0)^0 -- 50- times slower

    end

end

utf.length = utf.len

if not utf.sub then

    -- inefficient as lpeg just copies ^n

    -- local function sub(str,start,stop)
    --     local pattern = p_utf8character^-(start-1) * C(p_utf8character^-(stop-start+1))
    --     inspect(pattern)
    --     return lpegmatch(pattern,str) or ""
    -- end

    -- local b, e, n, first, last = 0, 0, 0, 0, 0
    --
    -- local function slide(s,p)
    --     n = n + 1
    --     if n == first then
    --         b = p
    --         if not last then
    --             return nil
    --         end
    --     end
    --     if n == last then
    --         e = p
    --         return nil
    --     else
    --         return p
    --     end
    -- end
    --
    -- local pattern = Cmt(p_utf8character,slide)^0
    --
    -- function utf.sub(str,start,stop) -- todo: from the end
    --     if not start then
    --         return str
    --     end
    --     b, e, n, first, last = 0, 0, 0, start, stop
    --     lpegmatch(pattern,str)
    --     if not stop then
    --         return sub(str,b)
    --     else
    --         return sub(str,b,e-1)
    --     end
    -- end

    -- print(utf.sub("Hans Hagen is my name"))
    -- print(utf.sub("Hans Hagen is my name",5))
    -- print(utf.sub("Hans Hagen is my name",5,10))

    local utflength = utf.length

    -- also negative indices, upto 10 times slower than a c variant

    -- The slide_* functions are match time callbacks that get called once per
    -- character: n counts characters, b and e receive the byte positions that are
    -- later fed to string.sub. They communicate via the shared upvalues below,
    -- which utf.sub resets before each match.

    local b, e, n, first, last = 0, 0, 0, 0, 0

    local function slide_zero(s,p) -- only looks for the end
        n = n + 1
        if n >= last then
            e = p - 1
        else
            return p
        end
    end

    local function slide_one(s,p) -- looks for the begin and the end
        n = n + 1
        if n == first then
            b = p
        end
        if n >= last then
            e = p - 1
        else
            return p
        end
    end

    local function slide_two(s,p) -- only looks for the begin
        n = n + 1
        if n == first then
            b = p
        else
            return true
        end
    end

    local pattern_zero  = Cmt(p_utf8character,slide_zero)^0
    local pattern_one   = Cmt(p_utf8character,slide_one )^0
    local pattern_two   = Cmt(p_utf8character,slide_two )^0

    local pattern_first = C(p_utf8character)

    -- utf.sub(str,start,stop) : like string.sub but start and stop count characters
    -- instead of bytes. Without start the string itself is returned, a start of 0 is
    -- treated as 1, and negative values count from the end.

    function utf.sub(str,start,stop)
        if not start then
            return str
        end
        if start == 0 then
            start = 1
        end
        if not stop then
            if start < 0 then
                local l = utflength(str) -- we can inline this function if needed
                start = l + start
            else
                start = start - 1
            end
            b, n, first = 0, 0, start
            lpegmatch(pattern_two,str)
            if n >= first then
                return sub(str,b)
            else
                return ""
            end
        end
        if start < 0 or stop < 0 then
            local l = utf.length(str)
            if start < 0 then
                start = l + start
                if start <= 0 then
                    start = 1
                else
                    start = start + 1
                end
            end
            if stop < 0 then
                stop = l + stop
                if stop == 0 then
                    stop = 1
                else
                    stop = stop + 1
                end
            end
        end
        if start == 1 and stop == 1 then
            return lpegmatch(pattern_first,str) or ""
        elseif start > stop then
            return ""
        elseif start > 1 then
            b, e, n, first, last = 0, 0, 0, start - 1, stop
            lpegmatch(pattern_one,str)
            if n >= first and e == 0 then
                e = #str
            end
            return sub(str,b,e)
        else
            b, e, n, last = 1, 0, 0, stop
            lpegmatch(pattern_zero,str)
            if e == 0 then
                e = #str
            end
            return sub(str,b,e)
        end
    end

    -- local n = 100000
    -- local str = string.rep("123456àáâãäå",100)
    --
    -- for i=-15,15,1 do
    --     for j=-15,15,1 do
    --         if utf.xsub(str,i,j) ~= utf.sub(str,i,j) then
    --             print("error",i,j,"l>"..utf.xsub(str,i,j),"s>"..utf.sub(str,i,j))
    --         end
    --     end
    --     if utf.xsub(str,i) ~= utf.sub(str,i) then
    --         print("error",i,"l>"..utf.xsub(str,i),"s>"..utf.sub(str,i))
    --     end
    -- end

    -- print(" 1, 7",utf.xsub(str, 1, 7),utf.sub(str, 1, 7))
    -- print(" 0, 7",utf.xsub(str, 0, 7),utf.sub(str, 0, 7))
    -- print(" 0, 9",utf.xsub(str, 0, 9),utf.sub(str, 0, 9))
    -- print(" 4   ",utf.xsub(str, 4   ),utf.sub(str, 4   ))
    -- print(" 0   ",utf.xsub(str, 0   ),utf.sub(str, 0   ))
    -- print(" 0, 0",utf.xsub(str, 0, 0),utf.sub(str, 0, 0))
    -- print(" 4, 4",utf.xsub(str, 4, 4),utf.sub(str, 4, 4))
    -- print(" 4, 0",utf.xsub(str, 4, 0),utf.sub(str, 4, 0))
    -- print("-3, 0",utf.xsub(str,-3, 0),utf.sub(str,-3, 0))
    -- print(" 0,-3",utf.xsub(str, 0,-3),utf.sub(str, 0,-3))
    -- print(" 5,-3",utf.xsub(str,-5,-3),utf.sub(str,-5,-3))
    -- print("-3   ",utf.xsub(str,-3   ),utf.sub(str,-3   ))

end

-- a replacement for simple gsubs:

-- function utf.remapper(mapping)
--     local pattern = Cs((p_utf8character/mapping)^0)
--     return function(str)
--         if not str or str == "" then
--             return ""
--         else
--             return lpegmatch(pattern,str)
--         end
--     end, pattern
-- end

-- utf.remapper(mapping,option,action)
--
--   mapping : a table (its keys are turned into a pattern by tabletopattern) or a
--             function (called for each character)
--   option  : "dynamic" : returns a function; the pattern is rebuilt after a key gets
--                         added to the mapping
--             "pattern" : returns only the pattern
--             otherwise : returns a function as well as the pattern
--   action  : what a table match is fed into (defaults to the mapping itself)
--
-- The returned functions return "" for a nil or empty string. With any other kind of
-- mapping we return a function that just passes the string.

function utf.remapper(mapping,option,action) -- static also returns a pattern
    local variant = type(mapping)
    if variant == "table" then
        action = action or mapping
        if option == "dynamic" then
            local pattern = false
            table.setmetatablenewindex(mapping,function(t,k,v) rawset(t,k,v) pattern = false end)
            return function(str)
                if not str or str == "" then
                    return ""
                else
                    if not pattern then
                        pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
                    end
                    return lpegmatch(pattern,str)
                end
            end
        elseif option == "pattern" then
            return Cs((tabletopattern(mapping)/action + p_utf8character)^0)
     -- elseif option == "static" then
        else
            local pattern = Cs((tabletopattern(mapping)/action + p_utf8character)^0)
            return function(str)
                if not str or str == "" then
                    return ""
                else
                    return lpegmatch(pattern,str)
                end
            end, pattern
        end
    elseif variant == "function" then
        if option == "pattern" then
            return Cs((p_utf8character/mapping + p_utf8character)^0)
        else
            local pattern = Cs((p_utf8character/mapping + p_utf8character)^0)
            return function(str)
                if not str or str == "" then
                    return ""
                else
                    return lpegmatch(pattern,str)
                end
            end, pattern
        end
    else
        -- is actually an error
        return function(str)
            return str or ""
        end
    end
end

-- local remap = utf.remapper { a = 'd', b = "c", c = "b", d = "a" }
-- print(remap("abcd 1234 abcd"))

function utf.replacer(t) -- no precheck, always string builder
    local r = replacer(t,false,false,true)
    return function(str)
        return lpegmatch(r,str)
    end
end

function utf.subtituter(t) -- with precheck and no building if no match
    local f = finder  (t)
    local r = replacer(t,false,false,true)
    return function(str)
        local i = lpegmatch(f,str)
        if not i then
            return str
        elseif i > #str then
            return str
        else
         -- return sub(str,1,i-2) .. lpegmatch(r,str,i-1) -- slower
            return lpegmatch(r,str)
        end
    end
end

utf.substituter = utf.subtituter -- the properly spelled name, the old one is kept

-- inspect(utf.split("a b c d"))
-- inspect(utf.split("a b c d",true))

local utflinesplitter     = p_utfbom^-1 * lpeg.tsplitat(p_newline)
local utfcharsplitter_ows = p_utfbom^-1 * Ct(C(p_utf8character)^0)
local utfcharsplitter_iws = p_utfbom^-1 * Ct((p_whitespace^1 + C(p_utf8character))^0)
local utfcharsplitter_raw = Ct(C(p_utf8character)^0)

patterns.utflinesplitter  = utflinesplitter

function utf.splitlines(str)
    return lpegmatch(utflinesplitter,str or "")
end

-- Returns a table with the characters of str; an optional bom is skipped and with
-- ignorewhitespace set, runs of whitespace are dropped.

function utf.split(str,ignorewhitespace) -- new
    if ignorewhitespace then
        return lpegmatch(utfcharsplitter_iws,str or "")
    else
        return lpegmatch(utfcharsplitter_ows,str or "")
    end
end

function utf.totable(str) -- keeps bom
    return lpegmatch(utfcharsplitter_raw,str)
end

-- 0  EF BB BF      UTF-8
-- 1  FF FE         UTF-16-little-endian
-- 2  FE FF         UTF-16-big-endian
-- 3  FF FE 00 00   UTF-32-little-endian
-- 4  00 00 FE FF   UTF-32-big-endian
--
-- \000 fails in <= 5.0 but is valid in >=5.1 where %z is deprecated

-- utf.name = {
--     [0] = 'utf-8',
--     [1] = 'utf-16-le',
--     [2] = 'utf-16-be',
--     [3] = 'utf-32-le',
--     [4] = 'utf-32-be'
-- }
--
-- function utf.magic(f)
--     local str = f:read(4)
--     if not str then
--         f:seek('set')
--         return 0
--  -- elseif find(str,"^%z%z\254\255") then            -- deprecated
--  -- elseif find(str,"^\000\000\254\255") then        -- not permitted and bugged
--     elseif find(str,"\000\000\254\255",1,true) then  -- seems to work okay (TH)
--         return 4
--  -- elseif find(str,"^\255\254%z%z") then            -- deprecated
--  -- elseif find(str,"^\255\254\000\000") then        -- not permitted and bugged
--     elseif find(str,"\255\254\000\000",1,true) then  -- seems to work okay (TH)
--         return 3
--     elseif find(str,"^\254\255") then
--         f:seek('set',2)
--         return 2
--     elseif find(str,"^\255\254") then
--         f:seek('set',2)
--         return 1
--     elseif find(str,"^\239\187\191") then
--         f:seek('set',3)
--         return 0
--     else
--         f:seek('set')
--         return 0
--     end
-- end

-- Reads (at most) the first four bytes of the file handle f, repositions the handle
-- at the offset that patterns.utfoffset reports when that is less than 4, and returns
-- what patterns.utftype reports for those bytes.

function utf.magic(f) -- not used
    local str = f:read(4) or ""
    local off = lpegmatch(p_utfoffset,str)
    if off < 4 then
        f:seek('set',off)
    end
    return lpegmatch(p_utftype,str)
end

local utf16_to_utf8_be, utf16_to_utf8_le
local utf32_to_utf8_be, utf32_to_utf8_le

local utf_16_be_getbom = patterns.utfbom_16_be^-1
local utf_16_le_getbom = patterns.utfbom_16_le^-1
local utf_32_be_getbom = patterns.utfbom_32_be^-1
local utf_32_le_getbom = patterns.utfbom_32_le^-1

local utf_16_be_linesplitter = utf_16_be_getbom * lpeg.tsplitat(patterns.utf_16_be_nl)
local utf_16_le_linesplitter = utf_16_le_getbom * lpeg.tsplitat(patterns.utf_16_le_nl)
local utf_32_be_linesplitter = utf_32_be_getbom * lpeg.tsplitat(patterns.utf_32_be_nl)
local utf_32_le_linesplitter = utf_32_le_getbom * lpeg.tsplitat(patterns.utf_32_le_nl)

-- we have three possibilities: bytepairs (using tables), gmatch (using tables), gsub and
-- lpeg. Bytepairs are the fastest but as soon as we need to remove boms and so the gain
-- is less due to more testing. Also, we seldom have to convert utf16 so we don't care too
-- much about a few milliseconds more runtime. The lpeg variant is upto 20% slower but
-- still pretty fast.
--
-- for historic reasons we keep the bytepairs variants around .. beware they don't grab the
-- bom like the lpegs do so they're not dropins in the functions that follow
--
-- utf16_to_utf8_be = function(s)
--     if not s then
--         return nil
--     elseif s == "" then
--         return ""
--     end
--     local result, r, more = { }, 0, 0
--     for left, right in bytepairs(s) do
--         if right then
--             local now = 256*left + right
--             if more > 0 then
--                 now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000 -- the 0x10000 smells wrong
--                 more = 0
--                 r = r + 1
--                 result[r] = utfchar(now)
--             elseif now >= 0xD800 and now <= 0xDBFF then
--                 more = now
--             else
--                 r = r + 1
--                 result[r] = utfchar(now)
--             end
--         end
--     end
--     return concat(result)
-- end
--
-- local utf16_to_utf8_be_t = function(t)
--     if not t then
--         return nil
--     elseif type(t) == "string" then
--         t = lpegmatch(utf_16_be_linesplitter,t)
--     end
--     local result = { } -- we reuse result
--     for i=1,#t do
--         local s = t[i]
--         if s ~= "" then
--             local r, more = 0, 0
--             for left, right in bytepairs(s) do
--                 if right then
--                     local now = 256*left + right
--                     if more > 0 then
--                         now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
--                         more = 0
--                         r = r + 1
--                         result[r] = utfchar(now)
--                     elseif now >= 0xD800 and now <= 0xDBFF then
--                         more = now
--                     else
--                         r = r + 1
--                         result[r] = utfchar(now)
--                     end
--                 end
--             end
--             t[i] = concat(result,"",1,r) -- we reused tmp, hence t
--         end
--     end
--     return t
-- end
--
-- utf16_to_utf8_le = function(s)
--     if not s then
--         return nil
--     elseif s == "" then
--         return ""
--     end
--     local result, r, more = { }, 0, 0
--     for left, right in bytepairs(s) do
--         if right then
--             local now = 256*right + left
--             if more > 0 then
--                 now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
--                 more = 0
--                 r = r + 1
--                 result[r] = utfchar(now)
--             elseif now >= 0xD800 and now <= 0xDBFF then
--                 more = now
--             else
--                 r = r + 1
--                 result[r] = utfchar(now)
--             end
--         end
--     end
--     return concat(result)
-- end
--
-- local utf16_to_utf8_le_t = function(t)
--     if not t then
--         return nil
--     elseif type(t) == "string" then
--         t = lpegmatch(utf_16_le_linesplitter,t)
--     end
--     local result = { } -- we reuse result
--     for i=1,#t do
--         local s = t[i]
--         if s ~= "" then
--             local r, more = 0, 0
--             for left, right in bytepairs(s) do
--                 if right then
--                     local now = 256*right + left
--                     if more > 0 then
--                         now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
--                         more = 0
--                         r = r + 1
--                         result[r] = utfchar(now)
--                     elseif now >= 0xD800 and now <= 0xDBFF then
--                         more = now
--                     else
--                         r = r + 1
--                         result[r] = utfchar(now)
--                     end
--                 end
--             end
--             t[i] = concat(result,"",1,r) -- we reused tmp, hence t
--         end
--     end
--     return t
-- end
--
-- local utf32_to_utf8_be_t = function(t)
--     if not t then
--         return nil
--     elseif type(t) == "string" then
--         t = lpegmatch(utflinesplitter,t)
--     end
--     local result = { } -- we reuse result
--     for i=1,#t do
--         local r, more = 0, -1
--         for a,b in bytepairs(t[i]) do
--             if a and b then
--                 if more < 0 then
--                     more = 256*256*256*a + 256*256*b
--                 else
--                     r = r + 1
--                     result[t] = utfchar(more + 256*a + b)
--                     more = -1
--                 end
--             else
--                 break
--             end
--         end
--         t[i] = concat(result,"",1,r)
--     end
--     return t
-- end
--
-- local utf32_to_utf8_le_t = function(t)
--     if not t then
--         return nil
--     elseif type(t) == "string" then
--         t = lpegmatch(utflinesplitter,t)
--     end
--     local result = { } -- we reuse result
--     for i=1,#t do
--         local r, more = 0, -1
--         for a,b in bytepairs(t[i]) do
--             if a and b then
--                 if more < 0 then
--                     more = 256*b + a
--                 else
--                     r = r + 1
--                     result[t] = utfchar(more + 256*256*256*b + 256*256*a)
--                     more = -1
--                 end
--             else
--                 break
--             end
--         end
--         t[i] = concat(result,"",1,r)
--     end
--     return t
-- end

-- The lpeg variants: 'more' holds a pending leading surrogate (0 when there is none)
-- and is shared by the big and little endian converters.

local more = 0

local p_utf16_to_utf8_be = C(1) * C(1) /function(left,right)
    local now = 256*byte(left) + byte(right)
    if more > 0 then
        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
        more = 0
        return utfchar(now)
    elseif now >= 0xD800 and now <= 0xDBFF then
        more = now
        return "" -- else the c's end up in the stream
    else
        return utfchar(now)
    end
end
-- Little endian counterpart of the big endian converter: the low byte comes first.
-- A leading surrogate is remembered in 'more' and combined with the next unit.

local p_utf16_to_utf8_le = C(1) * C(1) /function(right,left)
    local now = 256*byte(left) + byte(right)
    if more > 0 then
        now = (more-0xD800)*0x400 + (now-0xDC00) + 0x10000
        more = 0
        return utfchar(now)
    elseif now >= 0xD800 and now <= 0xDBFF then
        more = now
        return "" -- else the c's end up in the stream
    else
        return utfchar(now)
    end
end

local p_utf32_to_utf8_be = C(1) * C(1) * C(1) * C(1) /function(a,b,c,d)
    return utfchar(256*256*256*byte(a) + 256*256*byte(b) + 256*byte(c) + byte(d))
end

local p_utf32_to_utf8_le = C(1) * C(1) * C(1) * C(1) /function(a,b,c,d)
    return utfchar(256*256*256*byte(d) + 256*256*byte(c) + 256*byte(b) + byte(a))
end

-- Each converter first resets the pending surrogate, then skips an optional bom and
-- finally converts unit by unit.

p_utf16_to_utf8_be = P(true) / function() more = 0 end * utf_16_be_getbom * Cs(p_utf16_to_utf8_be^0)
p_utf16_to_utf8_le = P(true) / function() more = 0 end * utf_16_le_getbom * Cs(p_utf16_to_utf8_le^0)
p_utf32_to_utf8_be = P(true) / function() more = 0 end * utf_32_be_getbom * Cs(p_utf32_to_utf8_be^0)
p_utf32_to_utf8_le = P(true) / function() more = 0 end * utf_32_le_getbom * Cs(p_utf32_to_utf8_le^0)

patterns.utf16_to_utf8_be = p_utf16_to_utf8_be
patterns.utf16_to_utf8_le = p_utf16_to_utf8_le
patterns.utf32_to_utf8_be = p_utf32_to_utf8_be
patterns.utf32_to_utf8_le = p_utf32_to_utf8_le

-- The string variants return nil and "" untouched. The _t variants accept a string
-- (that then gets split into lines) or a table of lines; the table is converted in
-- place and returned, and nil results in nil.

utf16_to_utf8_be = function(s)
    if s and s ~= "" then
        return lpegmatch(p_utf16_to_utf8_be,s)
    else
        return s
    end
end

local utf16_to_utf8_be_t = function(t)
    if not t then
        return nil
    elseif type(t) == "string" then
        t = lpegmatch(utf_16_be_linesplitter,t)
    end
    for i=1,#t do
        local s = t[i]
        if s ~= "" then
            t[i] = lpegmatch(p_utf16_to_utf8_be,s)
        end
    end
    return t
end

utf16_to_utf8_le = function(s)
    if s and s ~= "" then
        return lpegmatch(p_utf16_to_utf8_le,s)
    else
        return s
    end
end

local utf16_to_utf8_le_t = function(t)
    if not t then
        return nil
    elseif type(t) == "string" then
        t = lpegmatch(utf_16_le_linesplitter,t)
    end
    for i=1,#t do
        local s = t[i]
        if s ~= "" then
            t[i] = lpegmatch(p_utf16_to_utf8_le,s)
        end
    end
    return t
end

utf32_to_utf8_be = function(s)
    if s and s ~= "" then
        return lpegmatch(p_utf32_to_utf8_be,s)
    else
        return s
    end
end

local utf32_to_utf8_be_t = function(t)
    if not t then
        return nil
    elseif type(t) == "string" then
        t = lpegmatch(utf_32_be_linesplitter,t)
    end
    for i=1,#t do
        local s = t[i]
        if s ~= "" then
            t[i] = lpegmatch(p_utf32_to_utf8_be,s)
        end
    end
    return t
end

utf32_to_utf8_le = function(s)
    if s and s ~= "" then
        return lpegmatch(p_utf32_to_utf8_le,s)
    else
        return s
    end
end

local utf32_to_utf8_le_t = function(t)
    if not t then
        return nil
    elseif type(t) == "string" then
        t = lpegmatch(utf_32_le_linesplitter,t)
    end
    for i=1,#t do
        local s = t[i]
        if s ~= "" then
            t[i] = lpegmatch(p_utf32_to_utf8_le,s)
        end
    end
    return t
end

utf.utf16_to_utf8_le_t = utf16_to_utf8_le_t
utf.utf16_to_utf8_be_t = utf16_to_utf8_be_t
utf.utf32_to_utf8_le_t = utf32_to_utf8_le_t
utf.utf32_to_utf8_be_t = utf32_to_utf8_be_t

utf.utf16_to_utf8_le   = utf16_to_utf8_le
utf.utf16_to_utf8_be   = utf16_to_utf8_be
utf.utf32_to_utf8_le   = utf32_to_utf8_le
utf.utf32_to_utf8_be   = utf32_to_utf8_be

function utf.utf8_to_utf8_t(t)
    return type(t) == "string" and lpegmatch(utflinesplitter,t) or t
end

-- A true endian selects the big endian converter, otherwise we assume little endian.

function utf.utf16_to_utf8_t(t,endian)
    return endian and utf16_to_utf8_be_t(t) or utf16_to_utf8_le_t(t) or t
end

function utf.utf32_to_utf8_t(t,endian)
    return endian and utf32_to_utf8_be_t(t) or utf32_to_utf8_le_t(t) or t
end

-- The utf-8 to utf-16 converters are only defined when bit32 is available. Code
-- points from 0x10000 upwards become a surrogate pair and bytes that are not part
-- of a character (according to p_utf8byte) are dropped.

if bit32 then

    local rshift = bit32.rshift

    local function little(b)
        if b < 0x10000 then
            return char(b%256,rshift(b,8))
        else
            b = b - 0x10000
            local b1 = rshift(b,10) + 0xD800
            local b2 = b%1024 + 0xDC00
            return char(b1%256,rshift(b1,8),b2%256,rshift(b2,8))
        end
    end

    local function big(b)
        if b < 0x10000 then
            return char(rshift(b,8),b%256)
        else
            b = b - 0x10000
            local b1 = rshift(b,10) + 0xD800
            local b2 = b%1024 + 0xDC00
            return char(rshift(b1,8),b1%256,rshift(b2,8),b2%256)
        end
    end

    local l_remap = Cs((p_utf8byte/little+P(1)/"")^0)
    local b_remap = Cs((p_utf8byte/big   +P(1)/"")^0)

    local function utf8_to_utf16_be(str,nobom)
        if nobom then
            return lpegmatch(b_remap,str)
        else
            return char(254,255) .. lpegmatch(b_remap,str)
        end
    end

    local function utf8_to_utf16_le(str,nobom)
        if nobom then
            return lpegmatch(l_remap,str)
        else
            return char(255,254) .. lpegmatch(l_remap,str)
        end
    end

    utf.utf8_to_utf16_be = utf8_to_utf16_be
    utf.utf8_to_utf16_le = utf8_to_utf16_le

    function utf.utf8_to_utf16(str,littleendian,nobom)
        if littleendian then
            return utf8_to_utf16_le(str,nobom)
        else
            return utf8_to_utf16_be(str,nobom)
        end
    end

end

-- utf.tocodes(str,separator) : a string with the code points of str as 0xXXXX,
-- separated by the given separator (a space when none is given).

local pattern = Cs (
    (p_utf8byte           / function(unicode          ) return format(  "0x%04X",          unicode) end) *
    (p_utf8byte * Carg(1) / function(unicode,separator) return format("%s0x%04X",separator,unicode) end)^0
)

function utf.tocodes(str,separator)
    return lpegmatch(pattern,str,1,separator or " ")
end

-- Both accept a number (code point) or a string (character).

function utf.ustring(s)
    return format("U+%05X",type(s) == "number" and s or utfbyte(s))
end

function utf.xstring(s)
    return format("0x%05X",type(s) == "number" and s or utfbyte(s))
end

-- Converts to utf-8 based on what patterns.utfstricttype reports: a utf-8 bom is
-- stripped, utf-16 is converted, anything else is returned as-is. A nil or empty
-- string results in nil.

function utf.toeight(str)
    if not str or str == "" then
        return nil
    end
    local utftype = lpegmatch(p_utfstricttype,str)
    if utftype == "utf-8" then
        return sub(str,4)               -- remove the bom
    elseif utftype == "utf-16-be" then
        return utf16_to_utf8_be(str)    -- bom gets removed
    elseif utftype == "utf-16-le" then
        return utf16_to_utf8_le(str)    -- bom gets removed
    else
        return str
    end
end

-- utf.count(str,what) : counts the occurrences of what in str. Every match becomes a
-- space and every other character is removed, so the length of the result is the
-- count. For string arguments the pattern is cached.

do

    local p_nany = p_utf8character / ""
    local cache  = { }

    function utf.count(str,what)
        if type(what) == "string" then
            local p = cache[what]
            if not p then
                p = Cs((P(what)/" " + p_nany)^0)
                cache[what] = p -- keyed by the string, otherwise the cache never hits
            end
            return #lpegmatch(p,str)
        else -- 4 times slower but still faster than / function
            return #lpegmatch(Cs((P(what)/" " + p_nany)^0),str)
        end
    end

end

if not string.utfvalues then

    -- So, a logical next step is to check for the values variant. It is over five
    -- times slower than the built-in string.utfvalues. I optimized it a bit for n=0,1.

    ----- wrap, yield, gmatch = coroutine.wrap, coroutine.yield, string.gmatch
    local find = string.find

    local dummy = function()
        -- we share this one
    end

    -- function string.utfvalues(str)
    --     local n = #str
    --     if n == 0 then
    --         return wrap(dummy)
    --     elseif n == 1 then
    --         return wrap(function() yield(utfbyte(str)) end)
    --     else
    --         return wrap(function() for s in gmatch(str,".[\128-\191]*") do
    --             yield(utfbyte(s))
    --         end end)
    --     end
    -- end
    --
    -- faster:

    -- Returns an iterator over the code points of str.

    function string.utfvalues(str)
        local n = #str
        if n == 0 then
            return dummy
        elseif n == 1 then
            -- An iterator has to return nil at some point, otherwise a generic for
            -- loop never ends, so we deliver the one value only once.
            local done = false
            return function()
                if not done then
                    done = true
                    return utfbyte(str)
                end
            end
        else
            local p = 1
         -- local n = #str
            return function()
             -- if p <= n then -- slower than the last find
                    local b, e = find(str,".[\128-\191]*",p)
                    if b then
                        p = e + 1
                        return utfbyte(sub(str,b,e))
                    end
             -- end
            end
        end
    end

    -- slower:
    --
    -- local pattern = C(p_utf8character) * Cp()
    -- ----- pattern = p_utf8character/utfbyte * Cp()
    -- ----- pattern = p_utf8byte * Cp()
    --
    -- function string.utfvalues(str) -- one of the cases where a find is faster than an lpeg
    --     local n = #str
    --     if n == 0 then
    --         return dummy
    --     elseif n == 1 then
    --         return function() return utfbyte(str) end
    --     else
    --         local p = 1
    --         return function()
    --             local s, e = lpegmatch(pattern,str,p)
    --             if e then
    --                 p = e
    --                 return utfbyte(s)
    --              -- return s
    --             end
    --         end
    --     end
    -- end

end

utf.values = string.utfvalues

-- Maps a number onto 1 .. 6 (or 0) using the thresholds 0x80, 0xE0, 0xF0, 0xF8, 0xFC
-- and 0xFE.
--
-- NOTE(review): these thresholds look like the lead byte ranges of a utf-8 sequence
-- and not like code point ranges -- confirm what callers pass.

function utf.chrlen(u) -- u is number
    return
        (u < 0x80 and 1) or
        (u < 0xE0 and 2) or
        (u < 0xF0 and 3) or
        (u < 0xF8 and 4) or
        (u < 0xFC and 5) or
        (u < 0xFE and 6) or 0
end

-- hashing saves a little but not that much in practice
--
-- local utf32 = table.setmetatableindex(function(t,k) local v = toutf32(k) t[k] = v return v end)

-- Returns n as four bytes, least significant byte first (only with bit32).

if bit32 then

    local extract = bit32.extract
    local char    = string.char

    function utf.toutf32string(n)
        if n <= 0xFF then
            return
                char(n) ..
                "\000\000\000"
        elseif n <= 0xFFFF then
            return
                char(extract(n, 0,8)) ..
                char(extract(n, 8,8)) ..
                "\000\000"
        elseif n <= 0xFFFFFF then
            return
                char(extract(n, 0,8)) ..
                char(extract(n, 8,8)) ..
                char(extract(n,16,8)) ..
                "\000"
        else
            return
                char(extract(n, 0,8)) ..
                char(extract(n, 8,8)) ..
                char(extract(n,16,8)) ..
                char(extract(n,24,8))
        end
    end

end

-- goodie:

local len = utf.len
local rep = string.rep -- was an (undefined) global

-- string.utfpadd(s,n,c) : pads s till it has abs(n) characters. A positive n pads at
-- the left, a negative n pads at the right. The optional c is the padding character,
-- which defaults to a space. Without n (or with n being zero) or when s is already
-- long enough, s is returned as-is.

function string.utfpadd(s,n,c)
    if n and n ~= 0 then
        local l = len(s)
        if n > 0 then
            local d = n - l
            if d > 0 then
                return rep(c or " ",d) .. s
            end
        else
            local d = - n - l
            if d > 0 then
                return s .. rep(c or " ",d)
            end
        end
    end
    return s
end

-- goodies

do

    local utfcharacters = utf.characters or string.utfcharacters
    local utfchar       = utf.char       or string.utfcharacter

    lpeg.UP = P

    -- lpeg.US(str) : a pattern that matches any of the characters in str

    if utfcharacters then

        function lpeg.US(str)
            local p = P(false)
            for uc in utfcharacters(str) do
                p = p + P(uc)
            end
            return p
        end

    else

        function lpeg.US(str)
            local p = P(false)
            local f = function(uc)
                p = p + P(uc)
            end
            lpegmatch((p_utf8char/f)^0,str)
            return p
        end

    end

    local range = p_utf8byte * p_utf8byte + Cc(false) -- utf8byte is already a capture

    -- lpeg.UR(str,more) : a range of characters. Either str is a string with two
    -- characters (first and last) or str and more are code points (more defaults
    -- to str).

    function lpeg.UR(str,more)
        local first, last
        if type(str) == "number" then
            first = str
            last  = more or first
        else
            first, last = lpegmatch(range,str)
            if not last then
                return P(str)
            end
        end
        if not utfchar then
            utfchar = utf.char -- maybe delayed
        end
        if first == last then
            if utfchar and type(str) == "number" then
                -- P(number) would match that many arbitrary bytes, so we need
                -- the character that belongs to the code point
                return P(utfchar(first))
            end
            return P(str)
        end
        if utfchar and (last - first < 8) then -- a somewhat arbitrary criterium
            local p = P(false)
            for i=first,last do
                p = p + P(utfchar(i))
            end
            return p -- nil when invalid range
        else
            local f = function(b)
                return b >= first and b <= last
            end
            -- tricky, these nested captures
            return p_utf8byte / f -- nil when invalid range
        end
    end

    -- print(lpeg.match(lpeg.Cs((C(lpeg.UR("αω"))/{ ["χ"] = "OEPS" })^0),"αωχαω"))

end