Commit 7af0b906 authored by Andreas Klebinger's avatar Andreas Klebinger Committed by Tamar Christina

Initialize hs_init with UTF8 encoded arguments on Windows.

Summary:
Get utf8 encoded arguments before we call hs_init and use them
instead of ignoring hs_init arguments. This reduces differing
behaviour of the RTS between windows and linux and simplifies
the code involved.

A few testcases were changed to expect the same result on windows
as on linux after the changes.

This fixes #13940.

Test Plan: ./validate

Reviewers: austin, hvr, bgamari, erikd, simonmar, Phyx

Subscribers: Phyx, rwbarton, thomie

GHC Trac Issues: #13940

Differential Revision: https://phabricator.haskell.org/D3739
parent 791947db
......@@ -337,6 +337,12 @@ reliably re-initialise after this has happened; see :ref:`infelicities-ffi`.
don't forget the flag :ghc-flag:`-no-hs-main`, otherwise GHC
will try to link to the ``Main`` Haskell module.
.. note::
On Windows hs_init treats argv as UTF8-encoded. Passing other encodings
might lead to unexpected results. Passing NULL as argv is valid but can
lead to <unknown> showing up in error messages instead of the name of the
executable.
To use ``+RTS`` flags with ``hs_init()``, we have to modify the example
slightly. By default, GHC's RTS will only accept "safe" ``+RTS`` flags (see
:ref:`options-linker`), and the :ghc-flag:`-rtsopts[=⟨none|some|all⟩]`
......
......@@ -211,12 +211,6 @@ DLL_IMPORT_RTS extern char **prog_argv; /* so we can get at these from Haskell *
DLL_IMPORT_RTS extern int prog_argc;
DLL_IMPORT_RTS extern char *prog_name;
#if defined(mingw32_HOST_OS)
// We need these two from Haskell too
void getWin32ProgArgv(int *argc, wchar_t **argv[]);
void setWin32ProgArgv(int argc, wchar_t *argv[]);
#endif
void reportStackOverflow(StgTSO* tso);
void reportHeapOverflow(void);
......
......@@ -8,11 +8,10 @@ import Foreign
import Foreign.C
import GHC.Base
import GHC.Real ( fromIntegral )
import GHC.IO.Encoding
import qualified GHC.Foreign as GHC
#if defined(mingw32_HOST_OS)
import GHC.IO (finally)
import GHC.Windows
# if defined(i386_HOST_ARCH)
# define WINDOWS_CCONV stdcall
# elif defined(x86_64_HOST_ARCH)
......@@ -20,9 +19,6 @@ import GHC.Windows
# else
# error Unknown mingw32 arch
# endif
#else
import GHC.IO.Encoding
import qualified GHC.Foreign as GHC
#endif
-- | Computation 'getFullArgs' is the "raw" version of 'getArgs', similar
......@@ -30,37 +26,14 @@ import qualified GHC.Foreign as GHC
-- command line arguments, starting with the program name, and
-- including those normally eaten by the RTS (+RTS ... -RTS).
getFullArgs :: IO [String]
#if defined(mingw32_HOST_OS)
-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
getFullArgs = do
p_arg_string <- c_GetCommandLine
alloca $ \p_argc -> do
p_argv <- c_CommandLineToArgv p_arg_string p_argc
if p_argv == nullPtr
then throwGetLastError "getFullArgs"
else flip finally (c_LocalFree p_argv) $ do
argc <- peek p_argc
p_argvs <- peekArray (fromIntegral argc) p_argv
mapM peekCWString p_argvs
foreign import WINDOWS_CCONV unsafe "windows.h GetCommandLineW"
c_GetCommandLine :: IO (Ptr CWString)
foreign import WINDOWS_CCONV unsafe "windows.h CommandLineToArgvW"
c_CommandLineToArgv :: Ptr CWString -> Ptr CInt -> IO (Ptr CWString)
foreign import WINDOWS_CCONV unsafe "Windows.h LocalFree"
c_LocalFree :: Ptr a -> IO (Ptr a)
#else
getFullArgs =
alloca $ \ p_argc ->
alloca $ \ p_argv -> do
getFullProgArgv p_argc p_argv
p <- fromIntegral `liftM` peek p_argc
argv <- peek p_argv
enc <- getFileSystemEncoding
peekArray p argv >>= mapM (GHC.peekCString enc)
alloca $ \ p_argc -> do
alloca $ \ p_argv -> do
getFullProgArgv p_argc p_argv
p <- fromIntegral `liftM` peek p_argc
argv <- peek p_argv
enc <- argvEncoding
peekArray p argv >>= mapM (GHC.peekCString enc)
foreign import ccall unsafe "getFullProgArgv"
getFullProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
#endif
......@@ -27,6 +27,7 @@ module GHC.IO.Encoding (
setLocaleEncoding, setFileSystemEncoding, setForeignEncoding,
char8,
mkTextEncoding,
argvEncoding
) where
import GHC.Base
......@@ -161,6 +162,17 @@ initFileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
initForeignEncoding = CodePage.mkLocaleEncoding IgnoreCodingFailure
#endif
-- See Note [Windows Unicode Arguments] in rts/RtsFlags.c
-- On Windows we assume hs_init argv is in utf8 encoding.
-- | Internal encoding of argv
argvEncoding :: IO TextEncoding
#if defined(mingw32_HOST_OS)
argvEncoding = return utf8
#else
argvEncoding = getFileSystemEncoding
#endif
-- | An encoding in which Unicode code points are translated to bytes
-- by taking the code point modulo 256. When decoding, bytes are
-- translated directly into the equivalent code point.
......
......@@ -38,13 +38,13 @@ import Control.Exception.Base (bracket)
#endif
-- import GHC.IO
import GHC.IO.Exception
import GHC.IO.Encoding (getFileSystemEncoding)
import qualified GHC.Foreign as GHC
import Control.Monad
#if defined(mingw32_HOST_OS)
import GHC.Environment
import GHC.IO.Encoding (argvEncoding)
import GHC.Windows
#else
import GHC.IO.Encoding (getFileSystemEncoding, argvEncoding)
import System.Posix.Internals (withFilePath)
#endif
......@@ -65,89 +65,21 @@ import System.Environment.ExecutablePath
-- ---------------------------------------------------------------------------
-- getArgs, getProgName, getEnv
#if defined(mingw32_HOST_OS)
{-
Note [Ignore hs_init argv]
~~~~~~~~~~~~~~~~~~~~~~~~~~
Ignore the arguments to hs_init on Windows for the sake of Unicode compat
Instead on Windows we get the list of arguments from getCommandLineW and
filter out arguments which the RTS would not have passed along.
This is done to ensure we get the arguments in proper Unicode Encoding which
the RTS at this moment does not seem provide. The filtering has to match the
one done by the RTS to avoid inconsistencies like #13287.
-}
getWin32ProgArgv_certainly :: IO [String]
getWin32ProgArgv_certainly = do
mb_argv <- getWin32ProgArgv
case mb_argv of
-- see Note [Ignore hs_init argv]
Nothing -> fmap dropRTSArgs getFullArgs
Just argv -> return argv
withWin32ProgArgv :: [String] -> IO a -> IO a
withWin32ProgArgv argv act = bracket begin setWin32ProgArgv (\_ -> act)
where
begin = do
mb_old_argv <- getWin32ProgArgv
setWin32ProgArgv (Just argv)
return mb_old_argv
getWin32ProgArgv :: IO (Maybe [String])
getWin32ProgArgv = alloca $ \p_argc -> alloca $ \p_argv -> do
c_getWin32ProgArgv p_argc p_argv
argc <- peek p_argc
argv_p <- peek p_argv
if argv_p == nullPtr
then return Nothing
else do
argv_ps <- peekArray (fromIntegral argc) argv_p
fmap Just $ mapM peekCWString argv_ps
setWin32ProgArgv :: Maybe [String] -> IO ()
setWin32ProgArgv Nothing = c_setWin32ProgArgv 0 nullPtr
setWin32ProgArgv (Just argv) = withMany withCWString argv $ \argv_ps -> withArrayLen argv_ps $ \argc argv_p -> do
c_setWin32ProgArgv (fromIntegral argc) argv_p
foreign import ccall unsafe "getWin32ProgArgv"
c_getWin32ProgArgv :: Ptr CInt -> Ptr (Ptr CWString) -> IO ()
foreign import ccall unsafe "setWin32ProgArgv"
c_setWin32ProgArgv :: CInt -> Ptr CWString -> IO ()
-- See Note [Ignore hs_init argv]
dropRTSArgs :: [String] -> [String]
dropRTSArgs [] = []
dropRTSArgs rest@("--":_) = rest
dropRTSArgs ("+RTS":rest) = dropRTSArgs (dropWhile (/= "-RTS") rest)
dropRTSArgs ("--RTS":rest) = rest
dropRTSArgs ("-RTS":rest) = dropRTSArgs rest
dropRTSArgs (arg:rest) = arg : dropRTSArgs rest
#endif
-- | Computation 'getArgs' returns a list of the program's command
-- line arguments (not including the program name).
getArgs :: IO [String]
#if defined(mingw32_HOST_OS)
getArgs = fmap tail getWin32ProgArgv_certainly
#else
getArgs =
alloca $ \ p_argc ->
alloca $ \ p_argv -> do
getProgArgv p_argc p_argv
p <- fromIntegral `liftM` peek p_argc
argv <- peek p_argv
enc <- getFileSystemEncoding
enc <- argvEncoding
peekArray (p - 1) (advancePtr argv 1) >>= mapM (GHC.peekCString enc)
foreign import ccall unsafe "getProgArgv"
getProgArgv :: Ptr CInt -> Ptr (Ptr CString) -> IO ()
#endif
{-|
Computation 'getProgName' returns the name of the program as it was
......@@ -160,10 +92,7 @@ between platforms: on Windows, for example, a program invoked as foo
is probably really @FOO.EXE@, and that is what 'getProgName' will return.
-}
getProgName :: IO String
#if defined(mingw32_HOST_OS)
-- Ignore the arguments to hs_init on Windows for the sake of Unicode compat
getProgName = fmap (basename . head) getWin32ProgArgv_certainly
#else
getProgName =
alloca $ \ p_argc ->
alloca $ \ p_argv -> do
......@@ -173,10 +102,9 @@ getProgName =
unpackProgName :: Ptr (Ptr CChar) -> IO String -- argv[0]
unpackProgName argv = do
enc <- getFileSystemEncoding
enc <- argvEncoding
s <- peekElemOff argv 0 >>= GHC.peekCString enc
return (basename s)
#endif
basename :: FilePath -> FilePath
basename f = go f f
......@@ -371,15 +299,7 @@ withProgName nm act = do
-- the duration of an action.
withArgv :: [String] -> IO a -> IO a
#if defined(mingw32_HOST_OS)
-- We have to reflect the updated arguments in the RTS-side variables as
-- well, because the RTS still consults them for error messages and the like.
-- If we don't do this then ghc-e005 fails.
withArgv new_args act = withWin32ProgArgv new_args $ withProgArgv new_args act
#else
withArgv = withProgArgv
#endif
withProgArgv :: [String] -> IO a -> IO a
withProgArgv new_args act = do
......@@ -391,7 +311,7 @@ withProgArgv new_args act = do
setProgArgv :: [String] -> IO ()
setProgArgv argv = do
enc <- getFileSystemEncoding
enc <- argvEncoding
GHC.withCStringsLen enc argv $ \len css ->
c_setProgArgv (fromIntegral len) css
......
......@@ -46,12 +46,11 @@ int rts_argc = 0; /* ditto */
char **rts_argv = NULL;
int rts_argv_size = 0;
#if defined(mingw32_HOST_OS)
// On Windows, we want to use GetCommandLineW rather than argc/argv,
// but we need to mutate the command line arguments for withProgName and
// friends. The System.Environment module achieves that using this bit of
// shared state:
int win32_prog_argc = 0;
wchar_t **win32_prog_argv = NULL;
// On Windows hs_main uses GetCommandLineW to get Unicode arguments and
// passes them along UTF8 encoded as argv. We store them here in order to
// free them on exit.
int win32_full_utf8_argc = 0;
char** win32_utf8_argv = NULL;
#endif
// The global rtsConfig, set from the RtsConfig supplied by the call
......@@ -111,6 +110,9 @@ static void read_trace_flags(const char *arg);
static void errorUsage (void) GNU_ATTRIBUTE(__noreturn__);
#if defined(mingw32_HOST_OS)
static char** win32_full_utf8_argv;
#endif
static char * copyArg (char *arg);
static char ** copyArgv (int argc, char *argv[]);
static void freeArgv (int argc, char *argv[]);
......@@ -446,6 +448,66 @@ usage_text[] = {
0
};
/**
Note [Windows Unicode Arguments]
~~~~~~~~~~~~~~~~~~~~~~~~~~
On Windows argv is usually encoded in the current Codepage which might not
support unicode.
Instead of ignoring the arguments to hs_init we expect them to be utf-8
encoded when coming from a custom main function. In the regular hs_main we
get the unicode arguments from the windows API and pass them along utf8
encoded instead.
This reduces special casing of arguments in later parts of the RTS and base
libraries to dealing with slash differences and using utf8 instead of the
current locale on Windows when decoding arguments.
*/
#if defined(mingw32_HOST_OS)
//Allocate a buffer and return the string utf8 encoded.
char* lpcwstrToUTF8(const wchar_t* utf16_str)
{
//Check the utf8 encoded size first
int res = WideCharToMultiByte(CP_UTF8, 0, utf16_str, -1, NULL, 0,
NULL, NULL);
if (res == 0) {
return NULL;
}
char* buffer = (char*) stgMallocBytes((size_t)res, "getUTF8Args 2");
res = WideCharToMultiByte(CP_UTF8, 0, utf16_str, -1, buffer, res,
NULL, NULL);
return buffer;
}
char** getUTF8Args(int* argc)
{
LPCWSTR cmdLine = GetCommandLineW();
LPWSTR* argvw = CommandLineToArgvW(cmdLine, argc);
// We create two argument arrays, one which is later permutated by the RTS
// instead of the main argv.
// The other one is used to free the allocted memory later.
char** argv = (char**) stgMallocBytes(sizeof(char*) * (*argc + 1),
"getUTF8Args 1");
win32_full_utf8_argv = (char**) stgMallocBytes(sizeof(char*) * (*argc + 1),
"getUTF8Args 1");
for (int i = 0; i < *argc; i++)
{
argv[i] = lpcwstrToUTF8(argvw[i]);
}
argv[*argc] = NULL;
memcpy(win32_full_utf8_argv, argv, sizeof(char*) * (*argc + 1));
LocalFree(argvw);
win32_utf8_argv = argv;
win32_full_utf8_argc = *argc;
return argv;
}
#endif
STATIC_INLINE bool strequal(const char *a, const char * b)
{
return(strcmp(a, b) == 0);
......@@ -514,12 +576,8 @@ static void errorRtsOptsDisabled(const char *s)
- rtsConfig (global) contains the supplied RtsConfig
On Windows getArgs ignores argv and instead takes the arguments directly
from the WinAPI and removes any which would have been parsed by the RTS.
If the handling of which arguments are passed to the Haskell side changes
these changes have to be synchronized with getArgs in base. See #13287 and
Note [Ignore hs_init argv] in System.Environment.
On Windows argv is assumed to be utf8 encoded for unicode compatibility.
See Note [Windows Unicode Arguments]
-------------------------------------------------------------------------- */
......@@ -579,6 +637,7 @@ void setupRtsFlags (int *argc, char *argv[], RtsConfig rts_config)
for (mode = PGM; arg < total_arg; arg++) {
// The '--RTS' argument disables all future +RTS ... -RTS processing.
if (strequal("--RTS", argv[arg])) {
arg++;
break;
}
......@@ -2040,48 +2099,18 @@ void freeWin32ProgArgv (void);
void
freeWin32ProgArgv (void)
{
int i;
if (win32_prog_argv != NULL) {
for (i = 0; i < win32_prog_argc; i++) {
stgFree(win32_prog_argv[i]);
}
stgFree(win32_prog_argv);
if(win32_utf8_argv == NULL) {
return;
}
else
{
freeArgv(win32_full_utf8_argc, win32_full_utf8_argv);
stgFree(win32_utf8_argv);
}
win32_prog_argc = 0;
win32_prog_argv = NULL;
}
void
getWin32ProgArgv(int *argc, wchar_t **argv[])
{
*argc = win32_prog_argc;
*argv = win32_prog_argv;
}
void
setWin32ProgArgv(int argc, wchar_t *argv[])
{
int i;
freeWin32ProgArgv();
win32_prog_argc = argc;
if (argv == NULL) {
win32_prog_argv = NULL;
return;
}
win32_prog_argv = stgCallocBytes(argc + 1, sizeof (wchar_t *),
"setWin32ProgArgv 1");
for (i = 0; i < argc; i++) {
win32_prog_argv[i] = stgMallocBytes((wcslen(argv[i]) + 1) * sizeof(wchar_t),
"setWin32ProgArgv 2");
wcscpy(win32_prog_argv[i], argv[i]);
}
win32_prog_argv[argc] = NULL;
}
#endif
/* ----------------------------------------------------------------------------
......
......@@ -13,6 +13,11 @@
/* Routines that operate-on/to-do-with RTS flags: */
#if defined(mingw32_HOST_OS)
//The returned buffer has to be freed with stgFree()
char* lpcwstrToUTF8(const wchar_t* utf16_str);
char** getUTF8Args(int* argc);
#endif
void initRtsFlagsDefaults (void);
void setupRtsFlags (int *argc, char *argv[], RtsConfig rtsConfig);
void freeRtsArgs (void);
......
......@@ -13,6 +13,7 @@
#include "RtsAPI.h"
#include "RtsUtils.h"
#include "RtsFlags.h"
#include "Prelude.h"
#include "Task.h"
#include "Excn.h"
......@@ -48,6 +49,16 @@ int hs_main ( int argc, char *argv[], // program args
int exit_status;
SchedulerStatus status;
// See Note: [Windows Unicode Arguments] in rts/RtsFlags.c
#if defined(mingw32_HOST_OS)
{
argv = getUTF8Args(&argc);
}
#endif
hs_init_ghc(&argc, &argv, rts_config);
// kick off the computation by creating the main thread with a pointer
......
......@@ -179,7 +179,33 @@ hs_init_ghc(int *argc, char **argv[], RtsConfig rts_config)
if (argc == NULL || argv == NULL) {
// Use a default for argc & argv if either is not supplied
int my_argc = 1;
#if defined(mingw32_HOST_OS)
//Retry larger buffer sizes on error up to about the NTFS length limit.
wchar_t* pathBuf;
char *my_argv[2] = { NULL, NULL };
for(DWORD maxLength = MAX_PATH; maxLength <= 33280; maxLength *= 2)
{
pathBuf = (wchar_t*) stgMallocBytes(sizeof(wchar_t) * maxLength,
"hs_init_ghc: GetModuleFileName");
DWORD pathLength = GetModuleFileNameW(NULL, pathBuf, maxLength);
if(GetLastError() == ERROR_INSUFFICIENT_BUFFER || pathLength == 0) {
stgFree(pathBuf);
pathBuf = NULL;
} else {
break;
}
}
if(pathBuf == NULL) {
my_argv[0] = "<unknown>";
} else {
my_argv[0] = lpcwstrToUTF8(pathBuf);
stgFree(pathBuf);
}
#else
char *my_argv[] = { "<unknown>", NULL };
#endif
setFullProgArgv(my_argc,my_argv);
setupRtsFlags(&my_argc, my_argv, rts_config);
} else {
......
......@@ -97,8 +97,6 @@
SymI_HasProto(stg_asyncReadzh) \
SymI_HasProto(stg_asyncWritezh) \
SymI_HasProto(stg_asyncDoProczh) \
SymI_HasProto(getWin32ProgArgv) \
SymI_HasProto(setWin32ProgArgv) \
SymI_HasProto(rts_InstallConsoleEvent) \
SymI_HasProto(rts_ConsoleHandlerDone) \
SymI_HasProto(atexit) \
......
......@@ -50,8 +50,7 @@ test('break009', [extra_files(['../Test6.hs']),
test('break010', extra_files(['../Test6.hs']), ghci_script, ['break010.script'])
test('break011',
[extra_files(['../Test7.hs']),
combined_output,
when(msys(), expect_broken(12712))],
combined_output],
ghci_script, ['break011.script'])
test('break012', normal, ghci_script, ['break012.script'])
test('break013', normal, ghci_script, ['break013.script'])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment