-- char-def now contains all necessary fields, no need for a custom file
if not characters then
  require "char-def"
end

if not unicode then require('unicode') end
unicode.conformance = unicode.conformance or { }

unicharacters = unicharacters or {}
uni = unicode.utf8
unidata = characters.data

function printf(s, ...) print(string.format(s, ...)) end
-- function debug(s, ...) io.write("DEBUG: ", string.format(s, ...), "\n") end
function warn(s, ...) io.write("Warning: ", string.format(s, ...), "\n") end

function md5sum(any) return md5.hex(md5.sum(any)) end

-- Rehash the character data
unicharacters.combinee = { }
unicharacters.context = unicharacters.context or { }
local charu = unicode.utf8.char

function unicharacters.context.rehash2()
  for ucode, udata in pairs(unidata) -- *not* ipairs :-)
  do
    local sp = udata.specials
    if sp then
      if sp[1] == 'char' then
        -- local ucode = udata.unicodeslot
        local entry = { combinee = sp[2], combining = sp[3], combined = ucode }
        if not unicharacters.combinee[sp[2]]
        then unicharacters.combinee[sp[2]] = { } end
        local n = #unicharacters.combinee[sp[2]]
        unicharacters.combinee[sp[2]][n+1] = entry
      end
    end
    -- copy context's combining field to combclass field
    -- this field was in the custom copy of the char-def.lua that we no longer use
    udata.combclass = udata.combining
  end
end


unicharacters.context.rehash2()
combdata = unicharacters.combinee

--[[ function unicode.conformance.is_hangul(ucode)
  return ucode >= 0xAC00 and ucode <= 0xD7A3
end ]] -- Make it local for the moment
local function is_hangul(char)
  return char >= 0xAC00 and char <= 0xD7A3
end

local function is_jamo(char)
  if char < 0x1100 then return false
  elseif char < 0x1160 then return 'choseong'
  elseif char < 0x11A8 then return 'jungseong'
  elseif char < 0x11FA then return 'jongseong'
  else return false
  end
end

local function decompose(ucode, compat) -- if compat then compatibility
  local invbuf = { }
  local sp = unidata[ucode].specials
  if not sp
  then return { ucode } else
    if compat then compat = (sp[1] == 'compat') else compat = false end
    while sp[1] == 'char' or compat do
      head, tail = sp[2], sp[3]
      if not tail then invbuf[#invbuf + 1] = head break end -- singleton
      invbuf[#invbuf + 1] = tail
        sp = unidata[head].specials
        if not sp then invbuf[#invbuf + 1] = head sp = { } end
      -- end -- not unidata[head]
    end -- while sp[1] == 'char' or compat
  end -- not sp

  local seq = { }
  for i = #invbuf, 1, -1
  do seq[#seq + 1] = invbuf[i]
  end
  return seq
end

local function canon(seq) -- Canonical reordering
  if #seq < 3 then return seq end
  local c1, c2, buf
  -- I'd never thought I'd implement an actual bubble sort some day ;-)
  for k = #seq - 1, 1, -1 do
    for i = 2, k do -- was k - 1!  Argh!
      c1 = unidata[seq[i]].combclass
      c2 = unidata[seq[i+1]].combclass
      if c1 and c2 then
        if c1 > c2 then
          buf = seq[i]
          seq[i] = seq[i+1]
          seq[i+1] = buf
        end
      end
    end
  end
  return seq
end

if not math.div then -- from l-math.lua
  function math.div(n, m)
    return math.floor(n/m)
  end
end

local SBase, LBase, VBase, TBase = 0xAC00, 0x1100, 0x1161, 0x11A7
local LCount, VCount, TCount = 19, 21, 28
local NCount = VCount * TCount
local SCount = TCount * NCount

local function decompose_hangul(ucode) -- assumes input is really a Hangul
  local SIndex = ucode - SBase
  local L = LBase + math.div(SIndex, NCount)
  local V = VBase + math.div((SIndex % NCount), TCount)
  local T = TBase + SIndex % TCount
  if T == TBase then T = nil end
  return { L, V, T }
end

-- To NFK?D.
function toNF_D_or_KD(unistring, compat)
  local nfd, seq = { }, { }
  for uchar in uni.gmatch(unistring, '.') do
    local ucode = uni.byte(uchar)
    if is_hangul(ucode) then
      seq = decompose_hangul(ucode)
      for  _, c in ipairs(seq)
      do nfd[#nfd + 1] = c
      end
      seq = { }
    elseif not unidata[ucode]
    then nfd[#nfd + 1] = ucode else
      local ccc = unidata[ucode].combclass
      if not ccc or ccc == 0 then
        seq = canon(seq)
        for _, c in ipairs(seq) do nfd[#nfd + 1] = c end
        seq = decompose(ucode, compat)
      else seq[#seq + 1] = ucode
      end -- not ccc or ccc == 0
    end -- if is_hangul(ucode) / elseif not unidata[ucode]
  end -- for uchar in uni.gmatch(unistring, ".")

  if #seq > 0 then
    seq = canon(seq)
    for _, c in ipairs(seq) do nfd[#nfd + 1] = c end
  end

  local nfdstr = ""
  for _, chr in ipairs(nfd)
  do nfdstr = string.format("%s%s", nfdstr, uni.char(chr)) end
  return nfdstr, nfd
end

function unicode.conformance.toNFD(unistring)
  return toNF_D_or_KD(unistring, false)
end

function unicode.conformance.toNFKD(unistring)
  return toNF_D_or_KD(unistring, true)
end

local function compose(seq)
  local base = seq[1]
  if not combdata[base] then return seq else
    local i = 2
    while i <= #seq do -- can I play with 'i' in a for loop?
      local cbng = seq[i]
      local cccprev
      if unidata[seq[i-1]] then cccprev = unidata[seq[i-1]].combclass end
      if not cccprev then cccprev = -1 end
      if unidata[cbng].combclass > cccprev then
        if not combdata[base] then return seq else
          for _, cbdata in ipairs(combdata[base]) do
            if cbdata.combining == cbng then
              seq[1] = cbdata.combined
              base = seq[1]
              for k = i, #seq - 1
              do seq[k] = seq[k+1]
              end -- for k = i, #seq - 1
              seq[#seq] = nil
              i = i - 1
            end -- if cbdata.combining == cbng
          end -- for _, cbdata in ipairs(combdata[base])
        end -- if unidata[cbng.combclass > cccprev
      end -- if not combdata[base]
    i = i + 1
    end -- while i <= #seq
  end -- if not combdata[base]
  return seq
end

-- To NFC from NFD.
-- Does not yet take all the composition exclusions in account
-- (missing types 1 and 2 as defined by UAX #15 X6)
function unicode.conformance.toNFC_fromNFD(nfd)
  local nfc = { }
  local seq = { }
  for uchar in uni.gmatch(nfd, '.') do
    local ucode = uni.byte(uchar)
    if not unidata[ucode]
    then nfc[#nfc + 1] = ucode else
      local cb = unidata[ucode].combclass
      if not cb or (cb == 0) then
        -- if seq ~= { } then -- Dubious ...
        if #seq > 0 then
          seq = compose(seq) -- There was a check for #seq == 1 here
          for i = 1, #seq do nfc[#nfc + 1] = seq[i] end
        end -- #seq > 0
        seq = { ucode } 
      else seq[#seq + 1] = ucode --[[ Maybe check if seq is not empty ... ]]
      end -- not cb or cb == 0
    end
  end

  seq = compose(seq)
  for i = 1, #seq do nfc[#nfc + 1] = seq[i] end

  local nfcstr = ""
  for _, chr in ipairs(nfc)
  do nfcstr = string.format("%s%s", nfcstr, uni.char(chr)) end
  return nfcstr, nfc
end

local function cancompose(seq, compat)
  local dec = { } -- new table to hold the decomposed sequence

  local shift
  if #seq >= 2 then -- let's do it the brutal way :-)
    if is_jamo(seq[1]) == 'choseong' and
       is_jamo(seq[2]) == 'jungseong' then
      LIndex = seq[1] - LBase
      VIndex = seq[2] - VBase
      if #seq == 2 or is_jamo(seq[3]) ~= 'jongseong' then
        TIndex = 0
        shift = 1
      else
        TIndex = seq[3] - TBase
        shift = 2
      end
      seq[1] = (LIndex * VCount + VIndex) * TCount + TIndex + SBase
      for i = 2, #seq -- this shifts and shrinks the table at the same time
      do seq[i] = seq[i + shift]
      end
    end
  end

  dec[1] = seq[1]
  for i = 2, #seq do
    local u = seq[i]
    local sp = unidata[u].specials
    if sp then
      if compat then compat = (sp[1] == 'compat') else compat = false end
      if (sp[1] == 'char') or compat then
        for i = 2, #sp
        do dec[#dec + 1] = sp[i]
        end
      end
    else dec[#dec + 1] = u
    end
  end -- we have the fully decomposed sequence; now sort it

  for i = #dec - 1, 2, -1 do -- bubble sort!
    for j = 2, #dec - 1 do
      local u = dec[j]
      local ccc1 = unidata[u].combclass
      local v = dec[j+1]
      local ccc2 = unidata[v].combclass
      if ccc1 > ccc2 then -- swap
        dec[j+1] = u
        dec[j] = v
      end
    end
  end -- dec sorted; now recursively compose

  local base, i, n = dec[1], 2, #dec
  local cbd = combdata[base]
  local incr_i = true
  while i <= n do
    local cbg = dec[i]
    if cbd then
      for _, cb in ipairs(cbd) do
        if cb.combining == cbg then
      -- NO :-) -- if cbd[cbg] then -- base and cbg combine; compose
          dec[1] = cb.combined
          base = dec[1]
          cbd = combdata[base]
          for j = i, n-1 -- shift table elements right of i
          do dec[j] = dec[j+1] end
          dec[n] = nil
          n = n-1 -- table has shrunk by 1, and i doesn't grow
          incr_i = false
        end
      end
    end
    if incr_i then i = i + 1
    else incr_i = true end
  end -- we're finally through! return
  return dec
end

function toNF_C_or_KC(unistring, compat)
  if unistring == "" then return "" end
  local nfc, seq = "", { }
  local start, space = true, ""
  for uchar in uni.gmatch(unistring, '.') do
    local ucode = uni.byte(uchar)
    if start then space = ", " start = true end
    if not unidata[ucode] then -- unknown to the UCD, will not compose
      nfc = string.format("%s%s", nfc, uchar)
    else
      local ccc = unidata[ucode].combclass 
      if not (ccc or is_jamo(ucode) == 'jongseong'
              or is_jamo(ucode) == 'jungseong')
         or ccc == 0 or (is_jamo(ucode) == 'choseong') then
      -- and is actually good :-) -- Well, yes and no ;-)
        if #seq == 0 then -- add ucode and go to next item of the loop
          seq = { ucode }
        else -- seq contains unicharacters, try and compose them
          if #seq == 1 then nfc = string.format("%s%s", nfc, uni.char(seq[1]))
          else dec = cancompose(seq, compat)
            for _, c in ipairs(dec) -- add the whole sequence to nfc
            do nfc = string.format("%s%s", nfc, uni.char(c)) end
          end
          seq = { ucode } -- don't forget to reinitialize seq with current char
        end
      else -- not ccc or ccc == 0 and is_choseong:
           -- character is combining, add it to seq
        seq[#seq + 1] = ucode
      end
    end
  end
  if #seq > 0 then dec = cancompose(seq, compat) end
  for _, c in ipairs(dec)
  do nfc = string.format("%s%s", nfc, uni.char(c)) end
  return nfc
end

function unicode.conformance.toNFC(unistring)
  return toNF_C_or_KC(unistring, false)
end

function unicode.conformance.toNFKC(unistring)
  return toNF_C_or_KC(unistring, true)
end