-----------------------------------------------------------------------------
-- |
-- Module      :  FastPackedString
-- Copyright   :  (c) The University of Glasgow 2001, David Roundy 2003
-- License : GPL (I'm happy to also license this file BSD style but don't
--           want to bother distributing two license files with darcs.
-- 
-- Maintainer  :  droundy@abridgegame.org
-- Stability   :  experimental
-- Portability :  portable
--
-- An efficient implementation of strings.
--
-----------------------------------------------------------------------------

-- Original GHC implementation by Bryan O\'Sullivan, 
-- rewritten to use UArray by Simon Marlow.
-- rewritten to support slices and use ForeignPtr by David Roundy

module FastPackedString (
	-- * The @PackedString@ type
        PackedString,      -- abstract, instances: Eq, Ord, Show, Typeable

         -- * Converting to and from @PackedString@s
	packString,  -- :: String -> PackedString
	unpackPS,    -- :: PackedString -> String

	-- * I\/O with @PackedString@s	
	hPutPS,      -- :: Handle -> PackedString -> IO ()
	hGetPS,      -- :: Handle -> Int -> IO PackedString
	readFilePS,  -- :: FilePath -> IO PackedString
	mmapFilePS,  -- :: FilePath -> IO PackedString

	-- * List-like manipulation functions
	nilPS,       -- :: PackedString
	consPS,      -- :: Char -> PackedString -> PackedString
	headPS,      -- :: PackedString -> Char
	tailPS,      -- :: PackedString -> PackedString
	nullPS,      -- :: PackedString -> Bool
	appendPS,    -- :: PackedString -> PackedString -> PackedString
	lengthPS,    -- :: PackedString -> Int
	indexPS,     -- :: PackedString -> Int -> Char
	mapPS,       -- :: (Char -> Char) -> PackedString -> PackedString
	--filterPS,    -- :: (Char -> Bool) -> PackedString -> PackedString
	reversePS,   -- :: PackedString -> PackedString
	concatPS,    -- :: [PackedString] -> PackedString
	elemPS,      -- :: Char -> PackedString -> Bool
	substrPS,    -- :: PackedString -> Int -> Int -> PackedString
	takePS,      -- :: Int -> PackedString -> PackedString
	dropPS,      -- :: Int -> PackedString -> PackedString
	splitAtPS,   -- :: Int -> PackedString -> (PackedString, PackedString)

	foldlPS,     -- :: (a -> Char -> a) -> a -> PackedString -> a
	foldrPS,     -- :: (Char -> a -> a) -> a -> PackedString -> a
	takeWhilePS, -- :: (Char -> Bool) -> PackedString -> PackedString
	dropWhilePS, -- :: (Char -> Bool) -> PackedString -> PackedString
	spanPS,      -- :: (Char -> Bool) -> PackedString -> (PackedString, PackedString)
	breakPS,     -- :: (Char -> Bool) -> PackedString -> (PackedString, PackedString)
	linesPS,     -- :: PackedString -> [PackedString]
	unlinesPS,     -- :: [PackedString] -> PackedString

        findPS,

	wordsPS,     -- :: PackedString -> [PackedString]
	splitPS,     -- :: Char -> PackedString -> [PackedString]
	splitWithPS, -- :: (Char -> Bool) -> PackedString -> [PackedString]

--	joinPS,      -- :: PackedString -> [PackedString] -> PackedString

        breakFirstPS,-- :: Char -> PackedString -> Maybe (PackedString,PackedString)
        breakLastPS, -- :: Char -> PackedString -> Maybe (PackedString,PackedString)
    ) where

import IO ( Handle, hClose, hFileSize, IOMode(ReadMode), openFile )

import Foreign.Storable ( peekElemOff, peek, poke )
import Ptr ( nullPtr, plusPtr, Ptr )
import Foreign.Marshal.Array ( pokeArray, mallocArray )
import Data.Dynamic
import Data.Char
import Data.Word
import Monad ( liftM )
import Posix ( handleToFd, fdToInt )

import System.IO.Unsafe ( unsafePerformIO )
import System.IO ( hPutBuf, hGetBuf )

#if __GLASGOW_HASKELL__ < 600
import Foreign.ForeignPtr( ForeignPtr, withForeignPtr, newForeignPtr )
mallocForeignPtrArray :: Int -> IO (ForeignPtr Word8)
mallocForeignPtrArray i = do p <- mallocArray i
                             newForeignPtr p (return ())
#else
import Foreign.ForeignPtr( ForeignPtr, withForeignPtr, mallocForeignPtrArray )
import Foreign.Concurrent( newForeignPtr )
#endif

-- -----------------------------------------------------------------------------
-- PackedString type declaration

-- | A space-efficient representation of a 'String', which supports various
-- efficient operations.  A 'PackedString' contains full Unicode 'Char's.
data PackedString = PS !(ForeignPtr Word8) !Int !Int

{-# INLINE (!) #-}
(!) :: PackedString -> Int -> Word8
(PS x s l) ! i = unsafePerformIO $ withForeignPtr x $ \p -> peekElemOff p (s+i)
  -- | i < 0 = error "Can't access negative element in PackedString."
  -- | i >= l = error "Out of range element in PackedString."
  -- | otherwise = unsafePerformIO $ withForeignPtr x $ \p -> peekElemOff p (s+i)

instance Eq PackedString where
   (==) = pseq

{-# INLINE pseq #-}
pseq (PS x1 s1 l1) (PS x2 s2 l2) =
    (l1 == l2 &&) $ unsafePerformIO $ withForeignPtr x1 $ \p1->
    withForeignPtr x2 $ \p2 ->
    let stop = p1 `plusPtr` s1 `plusPtr` l1
        iseq :: Ptr Word8 -> Ptr Word8 -> IO Bool
        iseq w1 w2 = if w1 == stop then return True
                     else do h1 <- peek w1
                             h2 <- peek w2
                             if h1 == h2
                                then iseq (w1 `plusPtr` 1) (w2 `plusPtr` 1)
                                else return False
        in
        iseq (p1 `plusPtr` s1) (p2 `plusPtr` s2)

instance Ord PackedString where
    compare = pscmp

pscmp (PS x1 s1 l1) (PS x2 s2 l2) =
    unsafePerformIO $ withForeignPtr x1 $ \p1->
        withForeignPtr x2 $ \p2 ->
    let doc :: Ptr Word8 -> Ptr Word8 -> IO Ordering
        st1 = p1 `plusPtr` s1 `plusPtr` l1
        st2 = p2 `plusPtr` s2 `plusPtr` l2
        doc w1 w2 =
            if w1 == st1 && w2 == st2 then return EQ
            else if w1 == st1 then return LT
                 else if w2 == st2 then return GT
                      else do h1 <- peek w1
                              h2 <- peek w2
                              if h1 < h2
                                 then return LT
                                 else if h1 == h2
                                      then doc (w1 `plusPtr` 1) (w2 `plusPtr` 1)
                                      else return GT
        in
        doc (p1 `plusPtr` s1) (p2 `plusPtr` s2)

--instance Read PackedString: ToDo

instance Show PackedString where
    showsPrec p ps r = showsPrec p (unpackPS ps) r

packedStringTc = mkTyCon "PackedString"
instance Typeable PackedString where { typeOf _ = mkAppTy packedStringTc [] }

-- -----------------------------------------------------------------------------
-- Constructor functions

nilPS :: PackedString
nilPS = unsafePerformIO $ do fp <- mallocForeignPtrArray 1
                             return $ PS fp 0 0

consPS :: Char -> PackedString -> PackedString
consPS c cs = packString (c : (unpackPS cs)) -- ToDo:better

-- | Convert a 'String' into a 'PackedString'
packString :: String -> PackedString
packString str = unsafePerformIO $ do
                 fp <- mallocForeignPtrArray (length str)
                 withForeignPtr fp $ \p-> pokeArray p $ map c2w str
                 return $ PS fp 0 (length str)

w2c = chr . fromIntegral
c2w = fromIntegral . ord

-- -----------------------------------------------------------------------------
-- Destructor functions (taking PackedStrings apart)

-- | Convert a 'PackedString' into a 'String'
unpackPS :: PackedString -> String
unpackPS theps = if nullPS theps then []
                 else headPS theps : unpackPS (tailPS theps)

-- -----------------------------------------------------------------------------
-- List-mimicking functions for PackedStrings

{-# INLINE lengthPS #-}
lengthPS :: PackedString -> Int
lengthPS (PS ps s l) = l

{-# INLINE indexPS #-}
indexPS :: PackedString -> Int -> Char
indexPS theps i | i < 0 = error "Negative index in indexPS"
                | i >= lengthPS theps = error "Out of bounds in indexPS"
                | otherwise = w2c $ theps ! i

{-# INLINE headPS #-}
headPS :: PackedString -> Char
headPS ps@(PS x s l) -- ps ! 0 is inlined manually to eliminate a (+0)
  | nullPS ps = error "FastPackedString.headPS: head []"
  | otherwise  = w2c $ unsafePerformIO $ withForeignPtr x $ \p -> peekElemOff p s

{-# INLINE tailPS #-}
tailPS :: PackedString -> PackedString
tailPS ps
  | len <= 0 = error "FastPackedString.tailPS: tail []"
  | len == 1 = nilPS
  | otherwise  = substrPS ps 1 (len - 1)
  where
    len = lengthPS ps

{-# INLINE nullPS #-}
nullPS :: PackedString -> Bool
nullPS (PS ps s l) = l == 0

appendPS :: PackedString -> PackedString -> PackedString
appendPS xs ys
  | nullPS xs = ys
  | nullPS ys = xs
  | otherwise  = concatPS [xs,ys]

mapPS :: (Char -> Char) -> PackedString -> PackedString
mapPS func (PS ps s l) = unsafePerformIO $ do
    fp <- mallocForeignPtrArray l
    withForeignPtr fp $ \p->
      withForeignPtr ps $ \f-> mint f p l
    return $ PS fp 0 l
    where mint :: Ptr Word8 -> Ptr Word8 -> Int -> IO ()
          mint _ _ 0 = return ()
          mint f t l = do val <- peek f
                          poke t $ c2w $ func $ w2c val
                          mint (f `plusPtr` 1) (t `plusPtr` 1) (l - 1)

--filterPS :: (Char -> Bool) -> PackedString -> PackedString {-or String?-}
--filterPS pred ps = packString (filter pred (unpackPS ps))

foldlPS :: (a -> Char -> a) -> a -> PackedString -> a
foldlPS f b ps = foldl f b (unpackPS ps)

foldrPS :: (Char -> a -> a) -> a -> PackedString -> a
foldrPS f v ps = foldr f v (unpackPS ps)

{-# INLINE takePS #-}
takePS :: Int -> PackedString -> PackedString
takePS n ps = if n > lengthPS ps then error "Can't take that many!"
              else substrPS ps 0 (n-1)

{-# INLINE dropPS #-}
dropPS	:: Int -> PackedString -> PackedString
dropPS n ps
    | n > lengthPS ps = error "FastPackedString.dropPS: can't drop that many"
    | otherwise = substrPS ps n (lengthPS ps - 1)

{-# INLINE splitAtPS #-}
splitAtPS :: Int -> PackedString -> (PackedString, PackedString)
splitAtPS  n ps  = (takePS n ps, dropPS n ps)

findWhenPS :: (Char -> Bool) -> PackedString -> Int
findWhenPS pred ps = if nullPS ps then 0
                     else if pred $ headPS ps then 0
                          else 1 + findWhenPS pred (tailPS ps)

takeWhilePS :: (Char -> Bool) -> PackedString -> PackedString
takeWhilePS pred ps = takePS (findWhenPS (not . pred) ps) ps

dropWhilePS :: (Char -> Bool) -> PackedString -> PackedString
dropWhilePS pred ps = dropPS (findWhenPS (not . pred) ps) ps

elemPS :: Char -> PackedString -> Bool
elemPS c ps = c `elem` unpackPS ps

spanPS :: (Char -> Bool) -> PackedString -> (PackedString, PackedString)
spanPS  p ps = breakPS (not . p) ps

breakPS :: (Char -> Bool) -> PackedString -> (PackedString, PackedString)
breakPS p ps = case findWhenPS p ps of
               n -> (takePS n ps, dropPS n ps)

{-# INLINE breakFirstPS #-}
breakFirstPS :: Char -> PackedString -> Maybe (PackedString,PackedString)
breakFirstPS c p = case findPS c p of
                   Nothing -> Nothing
                   Just n -> Just (takePS n p, dropPS (n+1) p)

{-# INLINE breakLastPS #-}
breakLastPS :: Char -> PackedString -> Maybe (PackedString,PackedString)
breakLastPS c p = case findLastPS c p of
                  Nothing -> Nothing
                  Just n -> Just (takePS n p, dropPS (n+1) p)

{-# INLINE linesPS #-}
linesPS :: PackedString -> [PackedString]
linesPS ps = splitPS '\n' ps

{-# INLINE unlinesPS #-}
unlinesPS :: [PackedString] -> PackedString
unlinesPS pss = packString $ unlines $ map unpackPS pss -- FIXME optimize this!

wordsPS :: PackedString -> [PackedString]
wordsPS ps = splitWithPS isSpace ps

reversePS :: PackedString -> PackedString
reversePS ps = packString (reverse (unpackPS ps))

concatPS :: [PackedString] -> PackedString
concatPS pss = packString (concat (map unpackPS pss))

{-# INLINE findPS #-}
findPS :: Char -> PackedString -> Maybe Int
findPS c ps = wfindPS (c2w c) ps

{-# INLINE wfindPS #-}
wfindPS :: Word8 -> PackedString -> Maybe Int
wfindPS c ps@(PS x s l) =
    unsafePerformIO $ withForeignPtr x $ \p->
                    findit c (p `plusPtr` s) 0
    where findit c p i = if i >= l then return Nothing
                           else do here <- peekElemOff p i
                                   if c == here then return $ Just i
                                      else findit c p (i+1)

{-# INLINE findLastPS #-}
findLastPS :: Char -> PackedString -> Maybe Int
findLastPS c ps = wfindLastPS (c2w c) ps

{-# INLINE wfindLastPS #-}
wfindLastPS :: Word8 -> PackedString -> Maybe Int
wfindLastPS c ps@(PS x s l) =
    unsafePerformIO $ withForeignPtr x $ \p->
                    findit c (-1) (p `plusPtr` s) 0
    where findit c h p i = if i >= l
                           then if h < 0
                                then return Nothing
                                else return $ Just h
                           else do here <- peekElemOff p i
                                   if c == here
                                      then findit c i p (i+1)
                                      else findit c h p (i+1)

------------------------------------------------------------

{-# INLINE splitPS #-}
splitPS :: Char -> PackedString -> [PackedString]
splitPS c = wsplitPS (c2w c)
{-# INLINE wsplitPS #-}
wsplitPS :: Word8 -> PackedString -> [PackedString]
wsplitPS c ps@(PS x s l) = case wfindPS c ps of
                           Nothing -> if nullPS ps then [] else [ps]
                           Just n -> takePS n ps : wsplitPS c (dropPS (n+1) ps)

splitWithPS :: (Char -> Bool) -> PackedString -> [PackedString]
splitWithPS pred ps =
    case [ m | m <- [0..lengthPS ps-1], pred (w2c (ps ! m)) ] of
    [] -> if nullPS ps then [] else [ps]
    (n:_) -> takePS n ps : splitWithPS pred (dropPS (n+1) ps)

-- -----------------------------------------------------------------------------
-- Local utility functions

-- The definition of @_substrPS@ is essentially:
-- @take (end - begin + 1) (drop begin str)@.

substrPS :: PackedString -> Int -> Int -> PackedString
substrPS (PS ps s l) begin end = PS ps (s+begin) (1+end-begin)

-- -----------------------------------------------------------------------------
-- hPutPS

-- | Outputs a 'PackedString' to the specified 'Handle'.  
--
-- NOTE: the representation of the 'PackedString' in the file is assumed to
-- be in the ISO-8859-1 encoding.  In other words, only the least signficant
-- byte is taken from each character in the 'PackedString'.
hPutPS :: Handle -> PackedString -> IO ()
hPutPS h (PS _ _ 0) = return ()
hPutPS h (PS ps 0 l) = withForeignPtr ps $ \p-> hPutBuf h p l
hPutPS h (PS ps s l) = withForeignPtr ps $ \p-> hPutBuf h (p `plusPtr` s) l

-- -----------------------------------------------------------------------------
-- hGetPS

-- | Read a 'PackedString' directly from the specified 'Handle'.  This
-- is far more efficient than reading the characters into a 'String'
-- and then using 'packString'.  
--
-- NOTE: as with 'hPutPS', the string representation in the file is 
-- assumed to be ISO-8859-1.
hGetPS :: Handle -> Int -> IO PackedString
hGetPS h 0 = return nilPS
hGetPS h i = do fp <- mallocForeignPtrArray i
                l <- withForeignPtr fp $ \p-> hGetBuf h p i
                return $ PS fp 0 l

-- -----------------------------------------------------------------------------
-- readFilePS

-- | Read an entire file directly into a 'PackedString'.  This is far more
-- efficient than reading the characters into a 'String' and then using
-- 'packString'.  It also may be more efficient than opening the file and
-- reading it using hGetPS.
--
-- NOTE: as with 'hGetPS', the string representation in the file is 
-- assumed to be ISO-8859-1.

readFilePS :: FilePath -> IO PackedString
readFilePS f = do h <- openFile f ReadMode
                  l <- hFileSize h
                  s <- hGetPS h $ fromIntegral l
                  hClose h
                  return s

-- -----------------------------------------------------------------------------
-- mmapFilePS

-- | Like readFilePS, this reads an entire file directly into a
-- 'PackedString', but it is even more efficient.  It involves directly
-- mapping the file to memory.  This has the advantage that the contents of
-- the file never need to be copied.  Also, under memory pressure the page
-- may simply be discarded, wile in the case of readFilePS it would need to
-- be written to swap.  If you read many small files, mmapFilePS will be
-- less memory-efficient than readFilePS, since each mmapFilePS takes up a
-- separate page of memory.  Also, you can run into bus errors if the file
-- is modified.  NOTE: as with 'readFilePS', the string representation in
-- the file is assumed to be ISO-8859-1.

mmapFilePS :: FilePath -> IO PackedString
mmapFilePS f = do (fp,l) <- mmap f
                  return $ PS fp 0 l

foreign import ccall unsafe "static sys/mman.h mmap" c_mmap
    :: Ptr () -> Int -> Int -> Int -> Int -> Int -> IO (Ptr Word8)
foreign import ccall unsafe "static sys/mman.h munmap" c_munmap
    :: Ptr Word8 -> Int -> IO Int
prot_read = 1
map_shared = 1
map_denywrite = 2048
map_failed = nullPtr `plusPtr` (-1)

mmap :: FilePath -> IO (ForeignPtr Word8, Int)
mmap f = do h <- openFile f ReadMode
            l <- fromIntegral `liftM` hFileSize h
            fd <- handleToFd h
            p <- c_mmap nullPtr l prot_read
                 (map_shared + map_denywrite) (fdToInt fd) 0
            fp <- if p == map_failed
                  then do thefp <- mallocForeignPtrArray l
                          withForeignPtr thefp $ \p-> hGetBuf h p l
                          return thefp
                  else newForeignPtr p (do {c_munmap p l; return (); })
            hClose h
            return (fp, l)
