8
votes

How can I make os.remove and os.rename work with filenames that contain unicode characters, using only the stock Lua 5.3?

-- Path whose file name contains non-ASCII (Greek) characters.
filename = "C:\\τέστ.txt"
-- Try to rename the file by appending "1" to its name.
os.rename(filename, filename .. "1")

This doesn't do anything.

I tried this too but still does not work:

-- Path whose file name contains non-ASCII (Greek) characters.
filename = "C:\\τέστ.txt"
-- Collect the numeric code point of every character of the path.
t = {}
for p, c in utf8.codes(filename) do 
  t[#t+1] = c
end
-- Join the code point numbers with literal backslashes. The result is plain
-- text made of backslashes and decimal digits; it is not interpreted as
-- escape sequences, so it no longer names the original file.
filename = "\\" .. table.concat(t, "\\")
os.rename(filename, filename .. "1")

Any ideas? Thank you very much in advance for your help! :)

5
The first piece of code renames the file to C:\\τέστ.txt1. I'm not sure this works in Windows. It does in macOS. – lhf
Thanks for trying it! I am on Windows 7 and it doesn't work here. I am interested in a solution that would work for both Windows and OS X. – lucullus
On Windows, you must convert your filenames from UTF-8 to the windows-1253 codepage prior to invoking os.rename and os.remove. – Egor Skriptunoff

5 Answers

4
votes

As others noted, you won't be able to do much with the stock version of Lua, as it's using CreateFileA and not the unicode version of this function (CreateFileW). If you can load external modules, you can use winapi, as it supports retrieval of "short" file names:

-- Load the optional winapi module through pcall so that a missing module
-- yields ok == false instead of raising an error.
local ok, winapi = pcall(require, "winapi")
if ok then
  -- NOTE(review): presumably tells winapi to treat Lua strings as UTF-8 -- confirm against the winapi docs.
  winapi.set_encoding(winapi.CP_UTF8)
  -- `filepath` is not defined in this snippet; the caller is expected to supply it.
  local shortpath = winapi.short_path(filepath)
  if shortpath ~= filepath then
    -- have the short path
  end
end

This code should work on all platforms (as it will fail to load winapi on macOS and Linux, where this conversion is not needed). The conversion may still fail if short (8.3) file names are not available; their generation can be configured per drive in Windows with the fsutil 8dot3name set DRIVE: 0 command.

The renaming will work if you run both the source and the destination file names through the conversion (and delete the destination file, as it may be created by the short_path call).

3
votes

As lhf pointed out, your code works fine on MacOS.
All you need is a correction for Windows.

The following code is written in pure Lua; it redefines standard os/io functions so that they work in Windows with UTF-8 filenames.
Please note that your Windows locale must be Greek and all your filenames must contain only symbols from Windows Greek codepage. On Windows in pure Lua, you can't open a file containing arbitrary UTF-8 symbols in its name.

if (os.getenv"os" or ""):match"^Windows" then

   -- Lookup table used when re-encoding file names for Windows.
   -- Key   = Unicode code point (as produced by utf8_to_unicode).
   -- Value = the single byte that stands for that character in the
   --         Windows Greek codepage (1253).
   -- Only codes >= 128 are listed; codes below 128 are copied unchanged by
   -- the caller. Byte values 0xAA, 0xD2 and 0xFF have no entry here.
   local map_unicode_to_1253 = {
      -- punctuation and symbols placed in 0x80..0x9B
      [0x20AC] = 0x80,
      [0x201A] = 0x82,
      [0x0192] = 0x83,
      [0x201E] = 0x84,
      [0x2026] = 0x85,
      [0x2020] = 0x86,
      [0x2021] = 0x87,
      [0x2030] = 0x89,
      [0x2039] = 0x8B,
      [0x2018] = 0x91,
      [0x2019] = 0x92,
      [0x201C] = 0x93,
      [0x201D] = 0x94,
      [0x2022] = 0x95,
      [0x2013] = 0x96,
      [0x2014] = 0x97,
      [0x2122] = 0x99,
      [0x203A] = 0x9B,
      -- 0xA0..0xBF: Latin-1 symbols mixed with accented Greek capitals
      [0x00A0] = 0xA0,
      [0x0385] = 0xA1,
      [0x0386] = 0xA2,
      [0x00A3] = 0xA3,
      [0x00A4] = 0xA4,
      [0x00A5] = 0xA5,
      [0x00A6] = 0xA6,
      [0x00A7] = 0xA7,
      [0x00A8] = 0xA8,
      [0x00A9] = 0xA9,
      [0x00AB] = 0xAB,
      [0x00AC] = 0xAC,
      [0x00AD] = 0xAD,
      [0x00AE] = 0xAE,
      [0x2015] = 0xAF,
      [0x00B0] = 0xB0,
      [0x00B1] = 0xB1,
      [0x00B2] = 0xB2,
      [0x00B3] = 0xB3,
      [0x0384] = 0xB4,
      [0x00B5] = 0xB5,
      [0x00B6] = 0xB6,
      [0x00B7] = 0xB7,
      [0x0388] = 0xB8,
      [0x0389] = 0xB9,
      [0x038A] = 0xBA,
      [0x00BB] = 0xBB,
      [0x038C] = 0xBC,
      [0x00BD] = 0xBD,
      [0x038E] = 0xBE,
      [0x038F] = 0xBF,
      -- 0xC0..0xFE: the Greek block U+0390..U+03CE (U+03A2 has no entry)
      [0x0390] = 0xC0,
      [0x0391] = 0xC1,
      [0x0392] = 0xC2,
      [0x0393] = 0xC3,
      [0x0394] = 0xC4,
      [0x0395] = 0xC5,
      [0x0396] = 0xC6,
      [0x0397] = 0xC7,
      [0x0398] = 0xC8,
      [0x0399] = 0xC9,
      [0x039A] = 0xCA,
      [0x039B] = 0xCB,
      [0x039C] = 0xCC,
      [0x039D] = 0xCD,
      [0x039E] = 0xCE,
      [0x039F] = 0xCF,
      [0x03A0] = 0xD0,
      [0x03A1] = 0xD1,
      [0x03A3] = 0xD3,
      [0x03A4] = 0xD4,
      [0x03A5] = 0xD5,
      [0x03A6] = 0xD6,
      [0x03A7] = 0xD7,
      [0x03A8] = 0xD8,
      [0x03A9] = 0xD9,
      [0x03AA] = 0xDA,
      [0x03AB] = 0xDB,
      [0x03AC] = 0xDC,
      [0x03AD] = 0xDD,
      [0x03AE] = 0xDE,
      [0x03AF] = 0xDF,
      [0x03B0] = 0xE0,
      [0x03B1] = 0xE1,
      [0x03B2] = 0xE2,
      [0x03B3] = 0xE3,
      [0x03B4] = 0xE4,
      [0x03B5] = 0xE5,
      [0x03B6] = 0xE6,
      [0x03B7] = 0xE7,
      [0x03B8] = 0xE8,
      [0x03B9] = 0xE9,
      [0x03BA] = 0xEA,
      [0x03BB] = 0xEB,
      [0x03BC] = 0xEC,
      [0x03BD] = 0xED,
      [0x03BE] = 0xEE,
      [0x03BF] = 0xEF,
      [0x03C0] = 0xF0,
      [0x03C1] = 0xF1,
      [0x03C2] = 0xF2,
      [0x03C3] = 0xF3,
      [0x03C4] = 0xF4,
      [0x03C5] = 0xF5,
      [0x03C6] = 0xF6,
      [0x03C7] = 0xF7,
      [0x03C8] = 0xF8,
      [0x03C9] = 0xF9,
      [0x03CA] = 0xFA,
      [0x03CB] = 0xFB,
      [0x03CC] = 0xFC,
      [0x03CD] = 0xFD,
      [0x03CE] = 0xFE,
   }
   local char, byte, table_insert, table_concat = string.char, string.byte, table.insert, table.concat

   -- Decodes the single UTF-8 encoded character of utf8str that starts at
   -- byte index pos.
   --   utf8str : string to read from
   --   pos     : byte index of the character's first byte (default 1)
   -- Returns two values: the decoded code and the number of bytes consumed.
   -- When a lead byte is not followed by the continuation bytes it needs,
   -- the lead byte's own value is returned with a size of 1.
   local function utf8_to_unicode(utf8str, pos)
      -- pos = starting byte position inside input string (default 1)
      pos = pos or 1
      local code, size = byte(utf8str, pos), 1
      -- Only bytes 0xC0..0xFD are treated as the start of a multi-byte
      -- sequence; any other byte is returned as-is with size 1.
      if code >= 0xC0 and code < 0xFE then
         local mask = 64
         -- drop the top bit of the lead byte
         code = code - 128
         repeat
            -- A byte past the end of the string reads as 0, which fails the
            -- continuation-byte range test below.
            local next_byte = byte(utf8str, pos + size) or 0
            if next_byte >= 0x80 and next_byte < 0xC0 then
               -- Continuation byte: remove the current marker bit (mask),
               -- shift by 6 bits and add the new byte. next_byte still
               -- carries its 0x80 prefix (= 2 * 64), hence the "- 2".
               code, size = (code - mask - 2) * 64 + next_byte, size + 1
            else
               -- Not a continuation byte: fall back to the raw lead byte.
               code, size = byte(utf8str, pos), 1
            end
            mask = mask * 32
            -- Keep going while the next marker bit is still set in code,
            -- i.e. while the lead byte announced more continuation bytes.
         until code < mask
      end
      -- returns code, number of bytes in this utf8 char
      return code, size
   end

   -- Re-encodes a UTF-8 string as single bytes using map_unicode_to_1253.
   -- Codes below 128 are copied unchanged, codes found in the table are
   -- replaced by their table value, and everything else becomes '?'.
   local function utf8_to_1253(utf8str)
      local encoded = {}
      local total, index = #utf8str, 1
      while index <= total do
         local codepoint, width = utf8_to_unicode(utf8str, index)
         index = index + width
         local out_byte
         if codepoint < 128 then
            out_byte = codepoint
         else
            out_byte = map_unicode_to_1253[codepoint] or byte('?')
         end
         encoded[#encoded + 1] = char(out_byte)
      end
      return table_concat(encoded)
   end

   -- Keep the stock os.rename reachable; the replacement below delegates to it.
   local native_rename = os.rename

   -- Replacement os.rename: both names are converted with utf8_to_1253 and
   -- the call is forwarded; the stock function's results are returned as-is.
   os.rename = function(old, new)
      local ansi_old = utf8_to_1253(old)
      local ansi_new = utf8_to_1253(new)
      return native_rename(ansi_old, ansi_new)
   end

   -- Keep the stock os.remove reachable; the replacement below delegates to it.
   local native_remove = os.remove

   -- Replacement os.remove: the name is converted with utf8_to_1253 before
   -- the call is forwarded; the stock function's results are returned as-is.
   os.remove = function(filename)
      local ansi_name = utf8_to_1253(filename)
      return native_remove(ansi_name)
   end

   -- Keep the stock os.execute reachable; the replacement below delegates to it.
   local native_execute = os.execute

   -- Replacement os.execute: a truthy command is converted with utf8_to_1253;
   -- nil or false is handed to the stock function untouched.
   os.execute = function(command)
      local ansi_command = command and utf8_to_1253(command)
      return native_execute(ansi_command)
   end

   -- Keep the stock io.open reachable; the replacement below delegates to it.
   local native_open = io.open

   -- Replacement io.open: only the file name is converted with utf8_to_1253;
   -- all remaining arguments are forwarded unchanged.
   io.open = function(filename, ...)
      local ansi_name = utf8_to_1253(filename)
      return native_open(ansi_name, ...)
   end

   -- Keep the stock io.popen reachable; the replacement below delegates to it.
   local native_popen = io.popen

   -- Replacement io.popen: the command line is converted with utf8_to_1253;
   -- all remaining arguments are forwarded unchanged.
   io.popen = function(prog, ...)
      local ansi_prog = utf8_to_1253(prog)
      return native_popen(ansi_prog, ...)
   end

   -- Keep the stock io.lines reachable; the replacement below delegates to it.
   local native_lines = io.lines

   -- Replacement io.lines: a truthy file name is converted with utf8_to_1253;
   -- nil or false is handed to the stock function untouched, along with any
   -- extra arguments.
   io.lines = function(filename, ...)
      local ansi_name = filename and utf8_to_1253(filename)
      return native_lines(ansi_name, ...)
   end

end

UPDATE:
How to determine Windows codepage:

-- Runs `reg query` on the Nls\CodePage registry key and extracts the value
-- of its ACP entry from the command output.
-- Returns the captured text (e.g. the string "1253"), or nil when the output
-- does not contain a matching ACP line. Raises if the pipe cannot be opened.
local function get_windows_ansi_codepage()
   local reg_query = assert(io.popen[[reg query HKLM\SYSTEM\CurrentControlSet\Control\Nls\CodePage /v ACP]])
   local output = reg_query:read"*a"
   reg_query:close()
   local acp = output:match"%sACP%s+REG_SZ%s+(.-)%s*$"
   return acp
end
3
votes

In the standard implementation, os.rename calls the C function rename which in turn goes to CreateFileA on Windows. This function is only supposed to work with ANSI strings and performs internally a conversion on the string from ANSI to Unicode.

This uses the system ANSI codepage specified by your locale settings. If your system is set to Greek, it may use codepage 1253 which defines Greek characters, but characters from other codepages aren't available.

I am not sure, but if the system codepage is set to 65001 (UTF-8), it may allow you to use UTF-8. There is also an application called AppLocale that can set it for a specific application only.

If you can use external libraries, there seem to be some that can use wide characters in system API calls.

2
votes

I also tried to use non-ASCII file names in unmodified Lua 5.3 in Windows, and it didn't work. I think it requires a modified version of Lua. My understanding is that Lua uses basic C functions for filenames, commands, and environment variables, but Windows uses the UTF-16 encoding and requires you to use the wide-string (which means UTF-16 on Windows) functions for non-ASCII filenames, commands, and environment variables.

There is a modified version of Lua that I compiled and tried out, and it handled a non-ASCII filename just fine: lua-u8w. It uses wide-string versions of various functions dealing with files and so on, and converts from UTF-8 to UTF-16 and back so that you can use UTF-8 in Lua while UTF-16 is used in dealing with the Windows operating system.

1
votes

Egor Skriptunoff's code completely solved my problem. I slightly modified his code so that other mapping tables can be added and the correct mapping is used according to the locale.

Thank you all for your help! :)

if (os.getenv"os" or ""):match"^Windows" then

  local char, byte, table_insert, table_concat = string.char, string.byte, table.insert, table.concat

  -- TABLES OF CODEPAGES

  -- Mapping table for the Windows Greek codepage (1253).
  -- Key   = Unicode code point (as produced by utf8_to_unicode).
  -- Value = the single byte that stands for that character in the codepage.
  -- Only codes >= 128 are listed; codes below 128 are copied unchanged by
  -- the caller. Byte values 0xAA, 0xD2 and 0xFF have no entry here.
  local cp1253 = { -- GREEK
    [0x20AC] = 0x80,  -- EURO SIGN
    [0x201A] = 0x82,  -- SINGLE LOW-9 QUOTATION MARK
    [0x0192] = 0x83,  -- LATIN SMALL LETTER F WITH HOOK
    [0x201E] = 0x84,  -- DOUBLE LOW-9 QUOTATION MARK
    [0x2026] = 0x85,  -- HORIZONTAL ELLIPSIS
    [0x2020] = 0x86,  -- DAGGER
    [0x2021] = 0x87,  -- DOUBLE DAGGER
    [0x2030] = 0x89,  -- PER MILLE SIGN
    [0x2039] = 0x8B,  -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    [0x2018] = 0x91,  -- LEFT SINGLE QUOTATION MARK
    [0x2019] = 0x92,  -- RIGHT SINGLE QUOTATION MARK
    [0x201C] = 0x93,  -- LEFT DOUBLE QUOTATION MARK
    [0x201D] = 0x94,  -- RIGHT DOUBLE QUOTATION MARK
    [0x2022] = 0x95,  -- BULLET
    [0x2013] = 0x96,  -- EN DASH
    [0x2014] = 0x97,  -- EM DASH
    [0x2122] = 0x99,  -- TRADE MARK SIGN
    [0x203A] = 0x9B,  -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    [0x00A0] = 0xA0,  -- NO-BREAK SPACE
    [0x0385] = 0xA1,  -- GREEK DIALYTIKA TONOS
    [0x0386] = 0xA2,  -- GREEK CAPITAL LETTER ALPHA WITH TONOS
    [0x00A3] = 0xA3,  -- POUND SIGN
    [0x00A4] = 0xA4,  -- CURRENCY SIGN
    [0x00A5] = 0xA5,  -- YEN SIGN
    [0x00A6] = 0xA6,  -- BROKEN BAR
    [0x00A7] = 0xA7,  -- SECTION SIGN
    [0x00A8] = 0xA8,  -- DIAERESIS
    [0x00A9] = 0xA9,  -- COPYRIGHT SIGN
    [0x00AB] = 0xAB,  -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    [0x00AC] = 0xAC,  -- NOT SIGN
    [0x00AD] = 0xAD,  -- SOFT HYPHEN
    [0x00AE] = 0xAE,  -- REGISTERED SIGN
    [0x2015] = 0xAF,  -- HORIZONTAL BAR
    [0x00B0] = 0xB0,  -- DEGREE SIGN
    [0x00B1] = 0xB1,  -- PLUS-MINUS SIGN
    [0x00B2] = 0xB2,  -- SUPERSCRIPT TWO
    [0x00B3] = 0xB3,  -- SUPERSCRIPT THREE
    [0x0384] = 0xB4,  -- GREEK TONOS
    [0x00B5] = 0xB5,  -- MICRO SIGN
    [0x00B6] = 0xB6,  -- PILCROW SIGN
    [0x00B7] = 0xB7,  -- MIDDLE DOT
    [0x0388] = 0xB8,  -- GREEK CAPITAL LETTER EPSILON WITH TONOS
    [0x0389] = 0xB9,  -- GREEK CAPITAL LETTER ETA WITH TONOS
    [0x038A] = 0xBA,  -- GREEK CAPITAL LETTER IOTA WITH TONOS
    [0x00BB] = 0xBB,  -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    [0x038C] = 0xBC,  -- GREEK CAPITAL LETTER OMICRON WITH TONOS
    [0x00BD] = 0xBD,  -- VULGAR FRACTION ONE HALF
    [0x038E] = 0xBE,  -- GREEK CAPITAL LETTER UPSILON WITH TONOS
    [0x038F] = 0xBF,  -- GREEK CAPITAL LETTER OMEGA WITH TONOS
    [0x0390] = 0xC0,  -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
    [0x0391] = 0xC1,  -- GREEK CAPITAL LETTER ALPHA
    [0x0392] = 0xC2,  -- GREEK CAPITAL LETTER BETA
    [0x0393] = 0xC3,  -- GREEK CAPITAL LETTER GAMMA
    [0x0394] = 0xC4,  -- GREEK CAPITAL LETTER DELTA
    [0x0395] = 0xC5,  -- GREEK CAPITAL LETTER EPSILON
    [0x0396] = 0xC6,  -- GREEK CAPITAL LETTER ZETA
    [0x0397] = 0xC7,  -- GREEK CAPITAL LETTER ETA
    [0x0398] = 0xC8,  -- GREEK CAPITAL LETTER THETA
    [0x0399] = 0xC9,  -- GREEK CAPITAL LETTER IOTA
    [0x039A] = 0xCA,  -- GREEK CAPITAL LETTER KAPPA
    [0x039B] = 0xCB,  -- GREEK CAPITAL LETTER LAMDA
    [0x039C] = 0xCC,  -- GREEK CAPITAL LETTER MU
    [0x039D] = 0xCD,  -- GREEK CAPITAL LETTER NU
    [0x039E] = 0xCE,  -- GREEK CAPITAL LETTER XI
    [0x039F] = 0xCF,  -- GREEK CAPITAL LETTER OMICRON
    [0x03A0] = 0xD0,  -- GREEK CAPITAL LETTER PI
    [0x03A1] = 0xD1,  -- GREEK CAPITAL LETTER RHO
    [0x03A3] = 0xD3,  -- GREEK CAPITAL LETTER SIGMA
    [0x03A4] = 0xD4,  -- GREEK CAPITAL LETTER TAU
    [0x03A5] = 0xD5,  -- GREEK CAPITAL LETTER UPSILON
    [0x03A6] = 0xD6,  -- GREEK CAPITAL LETTER PHI
    [0x03A7] = 0xD7,  -- GREEK CAPITAL LETTER CHI
    [0x03A8] = 0xD8,  -- GREEK CAPITAL LETTER PSI
    [0x03A9] = 0xD9,  -- GREEK CAPITAL LETTER OMEGA
    [0x03AA] = 0xDA,  -- GREEK CAPITAL LETTER IOTA WITH DIALYTIKA
    [0x03AB] = 0xDB,  -- GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA
    [0x03AC] = 0xDC,  -- GREEK SMALL LETTER ALPHA WITH TONOS
    [0x03AD] = 0xDD,  -- GREEK SMALL LETTER EPSILON WITH TONOS
    [0x03AE] = 0xDE,  -- GREEK SMALL LETTER ETA WITH TONOS
    [0x03AF] = 0xDF,  -- GREEK SMALL LETTER IOTA WITH TONOS
    [0x03B0] = 0xE0,  -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
    [0x03B1] = 0xE1,  -- GREEK SMALL LETTER ALPHA
    [0x03B2] = 0xE2,  -- GREEK SMALL LETTER BETA
    [0x03B3] = 0xE3,  -- GREEK SMALL LETTER GAMMA
    [0x03B4] = 0xE4,  -- GREEK SMALL LETTER DELTA
    [0x03B5] = 0xE5,  -- GREEK SMALL LETTER EPSILON
    [0x03B6] = 0xE6,  -- GREEK SMALL LETTER ZETA
    [0x03B7] = 0xE7,  -- GREEK SMALL LETTER ETA
    [0x03B8] = 0xE8,  -- GREEK SMALL LETTER THETA
    [0x03B9] = 0xE9,  -- GREEK SMALL LETTER IOTA
    [0x03BA] = 0xEA,  -- GREEK SMALL LETTER KAPPA
    [0x03BB] = 0xEB,  -- GREEK SMALL LETTER LAMDA
    [0x03BC] = 0xEC,  -- GREEK SMALL LETTER MU
    [0x03BD] = 0xED,  -- GREEK SMALL LETTER NU
    [0x03BE] = 0xEE,  -- GREEK SMALL LETTER XI
    [0x03BF] = 0xEF,  -- GREEK SMALL LETTER OMICRON
    [0x03C0] = 0xF0,  -- GREEK SMALL LETTER PI
    [0x03C1] = 0xF1,  -- GREEK SMALL LETTER RHO
    [0x03C2] = 0xF2,  -- GREEK SMALL LETTER FINAL SIGMA
    [0x03C3] = 0xF3,  -- GREEK SMALL LETTER SIGMA
    [0x03C4] = 0xF4,  -- GREEK SMALL LETTER TAU
    [0x03C5] = 0xF5,  -- GREEK SMALL LETTER UPSILON
    [0x03C6] = 0xF6,  -- GREEK SMALL LETTER PHI
    [0x03C7] = 0xF7,  -- GREEK SMALL LETTER CHI
    [0x03C8] = 0xF8,  -- GREEK SMALL LETTER PSI
    [0x03C9] = 0xF9,  -- GREEK SMALL LETTER OMEGA
    [0x03CA] = 0xFA,  -- GREEK SMALL LETTER IOTA WITH DIALYTIKA
    [0x03CB] = 0xFB,  -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA
    [0x03CC] = 0xFC,  -- GREEK SMALL LETTER OMICRON WITH TONOS
    [0x03CD] = 0xFD,  -- GREEK SMALL LETTER UPSILON WITH TONOS
    [0x03CE] = 0xFE,  -- GREEK SMALL LETTER OMEGA WITH TONOS
    }

  -- Extracts the trailing number of a locale name such as
  -- "Greek_Greece.1253"; returns nil when the name is nil or has no
  -- trailing digits (e.g. "C").
  local function codepage_of(locale_name)
    if not locale_name then
      return nil
    end
    local digits = string.match(locale_name, "(%d+)$")
    return digits and tonumber(digits)
  end

  -- os.setlocale() with no arguments only QUERIES the current locale. A
  -- program that never changed its locale is still in the "C" locale, whose
  -- name carries no codepage number, so the query alone would yield nil.
  local locale = codepage_of(os.setlocale())
  if not locale then
    -- Fall back to the native locale: switch the "ctype" category to it just
    -- long enough to read its name, then restore the previous setting so the
    -- rest of the program is unaffected.
    local saved_ctype = os.setlocale(nil, "ctype")
    locale = codepage_of(os.setlocale("", "ctype"))
    if saved_ctype then
      os.setlocale(saved_ctype, "ctype")
    end
  end
  local CODEPAGE

  -- Use appropriate locale
  -- Only cp1253 is defined in this script. cp1254 and cp1255 are placeholders
  -- for tables that must be added next to cp1253; until then they evaluate to
  -- nil and CODEPAGE stays unset for those locales.
  if locale == 1253 then -- GREEK
    CODEPAGE = cp1253
  elseif locale == 1254 then -- TURKISH
    CODEPAGE = cp1254
  elseif locale == 1255 then -- HEBREW
    CODEPAGE = cp1255
  -- etc
  end

  -- Decodes the single UTF-8 encoded character of utf8str that starts at
  -- byte index pos.
  --   utf8str : string to read from
  --   pos     : byte index of the character's first byte (default 1)
  -- Returns two values: the decoded code and the number of bytes consumed.
  -- When a lead byte is not followed by the continuation bytes it needs,
  -- the lead byte's own value is returned with a size of 1.
  local function utf8_to_unicode(utf8str, pos)
    -- pos = starting byte position inside input string (default 1)
    pos = pos or 1
    local code, size = byte(utf8str, pos), 1
    -- Only bytes 0xC0..0xFD are treated as the start of a multi-byte
    -- sequence; any other byte is returned as-is with size 1.
    if code >= 0xC0 and code < 0xFE then
      local mask = 64
      -- drop the top bit of the lead byte
      code = code - 128
      repeat
        -- A byte past the end of the string reads as 0, which fails the
        -- continuation-byte range test below.
        local next_byte = byte(utf8str, pos + size) or 0
        if next_byte >= 0x80 and next_byte < 0xC0 then
          -- Continuation byte: remove the current marker bit (mask), shift
          -- by 6 bits and add the new byte. next_byte still carries its
          -- 0x80 prefix (= 2 * 64), hence the "- 2".
          code, size = (code - mask - 2) * 64 + next_byte, size + 1
        else
          -- Not a continuation byte: fall back to the raw lead byte.
          code, size = byte(utf8str, pos), 1
        end
        mask = mask * 32
        -- Keep going while the next marker bit is still set in code,
        -- i.e. while the lead byte announced more continuation bytes.
      until code < mask
    end
    -- returns code, number of bytes in this utf8 char
    return code, size
  end

  -- Re-encodes a UTF-8 string as single bytes using the table in CODEPAGE.
  --   utf8str : UTF-8 encoded string (file name or command line)
  -- Returns the converted string. Codes below 128 are copied unchanged,
  -- codes found in CODEPAGE are replaced by their table value, and every
  -- other character becomes '?'.
  -- When no mapping table was selected (CODEPAGE is not a table), the input
  -- is returned unchanged instead of failing on the first non-ASCII
  -- character with "attempt to index a nil value"; the wrapped functions
  -- then behave like the stock ones.
  local function utf8_to_codepage(utf8str)
    if type(CODEPAGE) ~= "table" then
      return utf8str
    end
    local pos, result_codepage = 1, {}
    while pos <= #utf8str do
      local code, size = utf8_to_unicode(utf8str, pos)
      pos = pos + size
      -- a number is always truthy, so the and/or chain is safe here
      code = code < 128 and code or CODEPAGE[code] or byte('?')
      result_codepage[#result_codepage + 1] = char(code)
    end
    return table_concat(result_codepage)
  end

  -- Keep the stock os.rename reachable; the replacement below delegates to it.
  local native_rename = os.rename

  -- Replacement os.rename: both names are converted with utf8_to_codepage and
  -- the call is forwarded; the stock function's results are returned as-is.
  os.rename = function(old, new)
    local ansi_old = utf8_to_codepage(old)
    local ansi_new = utf8_to_codepage(new)
    return native_rename(ansi_old, ansi_new)
  end

  -- Keep the stock os.remove reachable; the replacement below delegates to it.
  local native_remove = os.remove

  -- Replacement os.remove: the name is converted with utf8_to_codepage before
  -- the call is forwarded; the stock function's results are returned as-is.
  os.remove = function(filename)
    local ansi_name = utf8_to_codepage(filename)
    return native_remove(ansi_name)
  end

  -- Keep the stock os.execute reachable; the replacement below delegates to it.
  local native_execute = os.execute

  -- Replacement os.execute: a truthy command is converted with
  -- utf8_to_codepage; nil or false is handed to the stock function untouched.
  os.execute = function(command)
    local ansi_command = command and utf8_to_codepage(command)
    return native_execute(ansi_command)
  end

  -- Keep the stock io.open reachable; the replacement below delegates to it.
  local native_open = io.open

  -- Replacement io.open: only the file name is converted with
  -- utf8_to_codepage; all remaining arguments are forwarded unchanged.
  io.open = function(filename, ...)
    local ansi_name = utf8_to_codepage(filename)
    return native_open(ansi_name, ...)
  end

  -- Keep the stock io.popen reachable; the replacement below delegates to it.
  local native_popen = io.popen

  -- Replacement io.popen: the command line is converted with
  -- utf8_to_codepage; all remaining arguments are forwarded unchanged.
  io.popen = function(prog, ...)
    local ansi_prog = utf8_to_codepage(prog)
    return native_popen(ansi_prog, ...)
  end

  -- Keep the stock io.lines reachable; the replacement below delegates to it.
  local native_lines = io.lines

  -- Replacement io.lines: a truthy file name is converted with
  -- utf8_to_codepage; nil or false is handed to the stock function untouched,
  -- along with any extra arguments.
  io.lines = function(filename, ...)
    local ansi_name = filename and utf8_to_codepage(filename)
    return native_lines(ansi_name, ...)
  end

end