From 618648708051d5fee43a0cbc704e88823f9037cd Mon Sep 17 00:00:00 2001 From: Julian Ospald Date: Sun, 9 Jun 2024 18:34:58 +0800 Subject: [PATCH] Add encodeLE/decodeLE, fixing #19 --- System/OsString.hs | 40 ++++++++++++++++++-- System/OsString/Common.hs | 56 ++++++++++++++++++++++++++-- System/OsString/Encoding.hs | 2 + System/OsString/Encoding/Internal.hs | 24 ++++++++++-- System/OsString/Internal.hs | 44 ++++++++++++++++++++-- changelog.md | 4 ++ os-string.cabal | 2 +- 7 files changed, 158 insertions(+), 14 deletions(-) diff --git a/System/OsString.hs b/System/OsString.hs index c4664af..f4ce501 100644 --- a/System/OsString.hs +++ b/System/OsString.hs @@ -24,6 +24,7 @@ module System.OsString , unsafeEncodeUtf , encodeWith , encodeFS + , encodeLE , osstr , empty , singleton @@ -33,6 +34,7 @@ module System.OsString , decodeUtf , decodeWith , decodeFS + , decodeLE , unpack -- * Word types @@ -136,14 +138,14 @@ import System.OsString.Internal , encodeUtf , unsafeEncodeUtf , encodeWith - , encodeFS + , encodeLE , osstr , pack , empty , singleton , decodeUtf , decodeWith - , decodeFS + , decodeLE , unpack , snoc , cons @@ -206,6 +208,38 @@ import System.OsString.Internal , findIndex , findIndices ) +import qualified System.OsString.Internal as SOI import System.OsString.Internal.Types ( OsString, OsChar, coercionToPlatformTypes ) -import Prelude () +import Prelude (String, IO) + +{-# DEPRECATED encodeFS "Use System.OsPath.encodeFS from filepath" #-} +-- | Like 'encodeUtf', except this mimics the behavior of the base library when doing filesystem +-- operations (usually filepaths), which is: +-- +-- 1. on unix, uses shady PEP 383 style encoding (based on the current locale, +-- but PEP 383 only works properly on UTF-8 encodings, so good luck) +-- 2. on windows does permissive UTF-16 encoding, where coding errors generate +-- Chars in the surrogate range +-- +-- Looking up the locale requires IO. 
If you're not worried about calls +-- to 'setFileSystemEncoding', then 'unsafePerformIO' may be feasible (make sure +-- to deeply evaluate the result to catch exceptions). +encodeFS :: String -> IO OsString +encodeFS = SOI.encodeFS + +{-# DEPRECATED decodeFS "Use System.OsPath.decodeFS from filepath" #-} +-- | Like 'decodeUtf', except this mimics the behavior of the base library when doing filesystem +-- operations (usually filepaths), which is: +-- +-- 1. on unix, uses shady PEP 383 style encoding (based on the current locale, +-- but PEP 383 only works properly on UTF-8 encodings, so good luck) +-- 2. on windows does permissive UTF-16 encoding, where coding errors generate +-- Chars in the surrogate range +-- +-- Looking up the locale requires IO. If you're not worried about calls +-- to 'setFileSystemEncoding', then 'unsafePerformIO' may be feasible (make sure +-- to deeply evaluate the result to catch exceptions). +decodeFS :: OsString -> IO String +decodeFS = SOI.decodeFS + diff --git a/System/OsString/Common.hs b/System/OsString/Common.hs index d225854..1ca38ea 100644 --- a/System/OsString/Common.hs +++ b/System/OsString/Common.hs @@ -33,6 +33,7 @@ module System.OsString.MODULE_NAME , unsafeEncodeUtf , encodeWith , encodeFS + , encodeLE , fromBytes , pstr , singleton @@ -43,6 +44,7 @@ module System.OsString.MODULE_NAME , decodeUtf , decodeWith , decodeFS + , decodeLE , unpack -- * Word construction @@ -242,14 +244,14 @@ encodeWith enc str = unsafePerformIO $ do #ifdef WINDOWS_DOC -- | This mimics the behavior of the base library when doing filesystem --- operations, which does permissive UTF-16 encoding, where coding errors generate +-- operations (usually filepaths), which does permissive UTF-16 encoding, where coding errors generate -- Chars in the surrogate range. -- -- The reason this is in IO is because it unifies with the Posix counterpart, -- which does require IO. This is safe to 'unsafePerformIO'/'unsafeDupablePerformIO'. 
#else -- | This mimics the behavior of the base library when doing filesystem --- operations, which uses shady PEP 383 style encoding (based on the current locale, +-- operations (usually filepaths), which uses shady PEP 383 style encoding (based on the current locale, -- but PEP 383 only works properly on UTF-8 encodings, so good luck). -- -- Looking up the locale requires IO. If you're not worried about calls @@ -258,11 +260,35 @@ encodeWith enc str = unsafePerformIO $ do #endif encodeFS :: String -> IO PLATFORM_STRING #ifdef WINDOWS +{-# DEPRECATED encodeFS "Use System.OsPath.Windows.encodeFS from filepath" #-} encodeFS = fmap WindowsString . encodeWithBaseWindows #else +{-# DEPRECATED encodeFS "Use System.OsPath.Posix.encodeFS from filepath" #-} encodeFS = fmap PosixString . encodeWithBasePosix #endif +#ifdef WINDOWS_DOC +-- | This mimics the behavior of the base library when doing string +-- operations, which does permissive UTF-16 encoding, where coding errors generate +-- Chars in the surrogate range. +-- +-- The reason this is in IO is because it unifies with the Posix counterpart, +-- which does require IO. This is safe to 'unsafePerformIO'/'unsafeDupablePerformIO'. +#else +-- | This mimics the behavior of the base library when doing string +-- operations, which uses 'getLocaleEncoding'. +-- +-- Looking up the locale requires IO. If you're not worried about calls +-- to 'setLocaleEncoding', then 'unsafePerformIO' may be feasible (make sure +-- to deeply evaluate the result to catch exceptions). +#endif +encodeLE :: String -> IO PLATFORM_STRING +#ifdef WINDOWS +encodeLE = fmap WindowsString . encodeWithBaseWindows +#else +encodeLE = fmap PosixString . encodeWithBasePosix' +#endif + #ifdef WINDOWS_DOC -- | Partial unicode friendly decoding. @@ -317,7 +343,29 @@ decodeWith unixEnc (PosixString ba) = unsafePerformIO $ do -- which does require IO. 'unsafePerformIO'/'unsafeDupablePerformIO' are safe, however. 
#else --- | This mimics the behavior of the base library when doing filesystem --- operations, which uses shady PEP 383 style encoding (based on the current locale, +-- | This mimics the behavior of the base library when doing string +-- operations, which uses 'getLocaleEncoding'. +-- +-- Looking up the locale requires IO. If you're not worried about calls +-- to 'setLocaleEncoding', then 'unsafePerformIO' may be feasible (make sure +-- to deeply evaluate the result to catch exceptions). +#endif +decodeLE :: PLATFORM_STRING -> IO String +#ifdef WINDOWS +decodeLE (WindowsString ba) = decodeWithBaseWindows ba +#else +decodeLE (PosixString ba) = decodeWithBasePosix' ba +#endif + +#ifdef WINDOWS_DOC +-- | Like 'decodeUtf', except this mimics the behavior of the base library when doing filesystem +-- operations (usually filepaths), which does permissive UTF-16 encoding, where coding errors generate +-- Chars in the surrogate range. +-- +-- The reason this is in IO is because it unifies with the Posix counterpart, +-- which does require IO. 'unsafePerformIO'/'unsafeDupablePerformIO' are safe, however. +#else +-- | This mimics the behavior of the base library when doing filesystem +-- operations (usually filepaths), which uses shady PEP 383 style encoding (based on the current locale, -- but PEP 383 only works properly on UTF-8 encodings, so good luck). -- -- Looking up the locale requires IO. 
If you're not worried about calls @@ -326,8 +374,10 @@ decodeWith unixEnc (PosixString ba) = unsafePerformIO $ do #endif decodeFS :: PLATFORM_STRING -> IO String #ifdef WINDOWS +{-# DEPRECATED decodeFS "Use System.OsPath.Windows.decodeFS from filepath" #-} decodeFS (WindowsString ba) = decodeWithBaseWindows ba #else +{-# DEPRECATED decodeFS "Use System.OsPath.Posix.decodeFS from filepath" #-} decodeFS (PosixString ba) = decodeWithBasePosix ba #endif diff --git a/System/OsString/Encoding.hs b/System/OsString/Encoding.hs index 2e6c02e..c17abb3 100644 --- a/System/OsString/Encoding.hs +++ b/System/OsString/Encoding.hs @@ -23,6 +23,8 @@ module System.OsString.Encoding -- * base encoding , encodeWithBasePosix , decodeWithBasePosix + , encodeWithBasePosix' + , decodeWithBasePosix' , encodeWithBaseWindows , decodeWithBaseWindows ) diff --git a/System/OsString/Encoding/Internal.hs b/System/OsString/Encoding/Internal.hs index 08f2af2..3466ac1 100644 --- a/System/OsString/Encoding/Internal.hs +++ b/System/OsString/Encoding/Internal.hs @@ -31,7 +31,7 @@ import Numeric (showHex) import Foreign.C (CStringLen) import Data.Char (chr) import Foreign -import GHC.IO.Encoding (getFileSystemEncoding) +import GHC.IO.Encoding (getFileSystemEncoding, getLocaleEncoding) -- ----------------------------------------------------------------------------- -- UCS-2 LE @@ -270,9 +270,15 @@ peekWindowsString (cp, l) = do withPosixString :: String -> (CStringLen -> IO a) -> IO a withPosixString fp f = getFileSystemEncoding >>= \enc -> GHC.withCStringLen enc fp f +withPosixString' :: String -> (CStringLen -> IO a) -> IO a +withPosixString' fp f = getLocaleEncoding >>= \enc -> GHC.withCStringLen enc fp f + peekPosixString :: CStringLen -> IO String peekPosixString fp = getFileSystemEncoding >>= \enc -> GHC.peekCStringLen enc fp +peekPosixString' :: CStringLen -> IO String +peekPosixString' fp = getLocaleEncoding >>= \enc -> GHC.peekCStringLen enc fp + -- | Decode with the given 'TextEncoding'. 
decodeWithTE :: TextEncoding -> BS8.ShortByteString -> Either EncodingException String decodeWithTE enc ba = unsafePerformIO $ do @@ -289,18 +295,30 @@ encodeWithTE enc str = unsafePerformIO $ do -- Encoders / decoders -- --- | This mimics the filepath decoder base uses on unix, +-- | This mimics the filepath decoder base uses on unix (using PEP-383), -- with the small distinction that we're not truncating at NUL bytes (because we're not at -- the outer FFI layer). decodeWithBasePosix :: BS8.ShortByteString -> IO String decodeWithBasePosix ba = BS8.useAsCStringLen ba $ \fp -> peekPosixString fp --- | This mimics the filepath dencoder base uses on unix, +-- | This mimics the string decoder base uses on unix, +-- with the small distinction that we're not truncating at NUL bytes (because we're not at +-- the outer FFI layer). +decodeWithBasePosix' :: BS8.ShortByteString -> IO String +decodeWithBasePosix' ba = BS8.useAsCStringLen ba $ \fp -> peekPosixString' fp + +-- | This mimics the filepath encoder base uses on unix (using PEP-383), -- with the small distinction that we're not truncating at NUL bytes (because we're not at -- the outer FFI layer). encodeWithBasePosix :: String -> IO BS8.ShortByteString encodeWithBasePosix str = withPosixString str $ \cstr -> BS8.packCStringLen cstr +-- | This mimics the string encoder base uses on unix, +-- with the small distinction that we're not truncating at NUL bytes (because we're not at +-- the outer FFI layer). +encodeWithBasePosix' :: String -> IO BS8.ShortByteString +encodeWithBasePosix' str = withPosixString' str $ \cstr -> BS8.packCStringLen cstr + -- | This mimics the filepath decoder base uses on windows, -- with the small distinction that we're not truncating at NUL bytes (because we're not at -- the outer FFI layer). 
diff --git a/System/OsString/Internal.hs b/System/OsString/Internal.hs index 1753d58..7f3d284 100644 --- a/System/OsString/Internal.hs +++ b/System/OsString/Internal.hs @@ -26,9 +26,11 @@ import System.OsString.Encoding ( EncodingException(..) ) import GHC.IO.Encoding.Failure ( CodingFailureMode(..) ) #if defined(mingw32_HOST_OS) || defined(__MINGW32__) import GHC.IO.Encoding.UTF16 ( mkUTF16le ) +import System.OsString.Encoding ( encodeWithBaseWindows, decodeWithBaseWindows ) import qualified System.OsString.Windows as PF #else import GHC.IO.Encoding.UTF8 ( mkUTF8 ) +import System.OsString.Encoding ( encodeWithBasePosix, decodeWithBasePosix ) import qualified System.OsString.Posix as PF #endif import GHC.Stack (HasCallStack) @@ -71,7 +73,7 @@ encodeWith unixEnc _ str = OsString <$> PF.encodeWith unixEnc str #endif -- | Like 'encodeUtf', except this mimics the behavior of the base library when doing filesystem --- operations, which is: +-- operations (usually filepaths), which is: -- -- 1. on unix, uses shady PEP 383 style encoding (based on the current locale, -- but PEP 383 only works properly on UTF-8 encodings, so good luck) @@ -82,7 +84,24 @@ encodeWith unixEnc _ str = OsString <$> PF.encodeWith unixEnc str -- to 'setFileSystemEncoding', then 'unsafePerformIO' may be feasible (make sure -- to deeply evaluate the result to catch exceptions). encodeFS :: String -> IO OsString -encodeFS = fmap OsString . PF.encodeFS +#if defined(mingw32_HOST_OS) || defined(__MINGW32__) +encodeFS = fmap (OsString . WindowsString) . encodeWithBaseWindows +#else +encodeFS = fmap (OsString . PosixString) . encodeWithBasePosix +#endif + +-- | Like 'encodeUtf', except this mimics the behavior of the base library when doing string +-- operations, which is: +-- +-- 1. on unix this uses 'getLocaleEncoding' +-- 2. on windows does permissive UTF-16 encoding, where coding errors generate +-- Chars in the surrogate range +-- +-- Looking up the locale requires IO. 
If you're not worried about calls +-- to 'setLocaleEncoding', then 'unsafePerformIO' may be feasible (make sure +-- to deeply evaluate the result to catch exceptions). +encodeLE :: String -> IO OsString +encodeLE = fmap OsString . PF.encodeLE -- | Partial unicode friendly decoding. @@ -110,7 +129,7 @@ decodeWith unixEnc _ (OsString x) = PF.decodeWith unixEnc x -- | Like 'decodeUtf', except this mimics the behavior of the base library when doing filesystem --- operations, which is: +-- operations (usually filepaths), which is: -- -- 1. on unix, uses shady PEP 383 style encoding (based on the current locale, -- but PEP 383 only works properly on UTF-8 encodings, so good luck) @@ -121,7 +140,24 @@ decodeWith unixEnc _ (OsString x) = PF.decodeWith unixEnc x -- to 'setFileSystemEncoding', then 'unsafePerformIO' may be feasible (make sure -- to deeply evaluate the result to catch exceptions). decodeFS :: OsString -> IO String -decodeFS (OsString x) = PF.decodeFS x +#if defined(mingw32_HOST_OS) || defined(__MINGW32__) +decodeFS (OsString (WindowsString x)) = decodeWithBaseWindows x +#else +decodeFS (OsString (PosixString x)) = decodeWithBasePosix x +#endif + +-- | Like 'decodeUtf', except this mimics the behavior of the base library when doing string operations, +-- which is: +-- +-- 1. on unix this uses 'getLocaleEncoding' +-- 2. on windows does permissive UTF-16 encoding, where coding errors generate +-- Chars in the surrogate range +-- +-- Looking up the locale requires IO. If you're not worried about calls +-- to 'setLocaleEncoding', then 'unsafePerformIO' may be feasible (make sure +-- to deeply evaluate the result to catch exceptions). +decodeLE :: OsString -> IO String +decodeLE (OsString x) = PF.decodeLE x -- | Constructs an @OsString@ from a ByteString. 
diff --git a/changelog.md b/changelog.md index 79e4b4b..dcfe77c 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,9 @@ # Changelog for [`os-string` package](http://hackage.haskell.org/package/os-string) +## 2.0.5 *Jun 2024* + +* Add `decodeLE`/`encodeLE` and deprecate `decodeFS`/`encodeFS` (pointing users to `System.OsPath` instead), fixes [#19](https://github.com/haskell/os-string/issues/19) + ## 2.0.3 *May 2024* * Fix `length` function wrt [#17](https://github.com/haskell/os-string/issues/17) diff --git a/os-string.cabal b/os-string.cabal index 827da78..7698c19 100644 --- a/os-string.cabal +++ b/os-string.cabal @@ -1,6 +1,6 @@ cabal-version: 2.2 name: os-string -version: 2.0.3 +version: 2.0.5 -- NOTE: Don't forget to update ./changelog.md license: BSD-3-Clause