1 {-# OPTIONS_GHC -fbang-patterns -fffi -cpp #-}
2 {-# LANGUAGE BangPatterns, ForeignFunctionInterface, CPP #-}
4 -----------------------------------------------------------------------------
6 -- Module : ByteStringUtils
7 -- Copyright : (c) The University of Glasgow 2001,
8 -- David Roundy 2003-2005
9 -- License : GPL (I'm happy to also license this file BSD style but don't
10 -- want to bother distributing two license files with darcs.
12 -- Maintainer : droundy@abridgegame.org
13 -- Stability : experimental
14 -- Portability : portable
16 -- GZIp and MMap IO for ByteStrings, and miscellaneous functions for Data.ByteString
19 module ByteStringUtils
(
24 -- IO with mmap or gzip
45 break_after_nth_newline
,
46 break_before_nth_newline
,
50 import Autoconf
( use_mmap
)
52 import qualified Data
.ByteString
as B
53 import qualified Data
.ByteString
.Char8
as BC
54 #if __GLASGOW_HASKELL__
> 606
55 import qualified Data
.ByteString
.Internal
as BI
56 import Data
.ByteString
(intercalate
, uncons
)
57 import Data
.ByteString
.Internal
(fromForeignPtr
)
59 import qualified Data
.ByteString
.Base
as BI
63 import System
.IO.Unsafe
( unsafePerformIO
)
65 #if __GLASGOW_HASKELL__
> 606
66 import Foreign
.Storable
( peekElemOff
, peek
)
68 import Foreign
.Storable
( peekElemOff
, peek
, peekByteOff
)
69 import Data
.List
( intersperse )
71 import Foreign
.Marshal
.Alloc
( free
)
72 import Foreign
.Marshal
.Array ( mallocArray
, peekArray
, advancePtr
)
73 import Foreign
.C
.Types
( CInt
, CSize
)
75 import Data
.Bits
( rotateL
)
76 import Data
.Char ( chr, ord, isSpace )
77 import Data
.Word
( Word8
)
78 import Data
.Int ( Int32
)
79 import Control
.Monad
( when )
81 import Foreign
.Ptr
( nullPtr
, plusPtr
, Ptr
)
82 import Foreign
.ForeignPtr
( ForeignPtr
, withForeignPtr
)
84 #if defined
(__GLASGOW_HASKELL__
)
85 import qualified Foreign
.Concurrent
as FC
( newForeignPtr
)
86 import System
.Posix
( handleToFd
)
90 import Foreign
.ForeignPtr
( addForeignPtrFinalizer
)
91 import Foreign
.Ptr
( FunPtr
)
95 import qualified Data
.ByteString
.Lazy
as BL
96 import qualified Codec
.Compression
.GZip
as GZ
98 import Foreign
.C
.String ( CString
, withCString
)
101 -- -----------------------------------------------------------------------------
102 -- obsolete debugging code
104 debugForeignPtr
:: ForeignPtr a
-> String -> IO ()
106 foreign import ccall unsafe
"static fpstring.h debug_alloc" debug_alloc
107 :: Ptr a
-> CString
-> IO ()
108 foreign import ccall unsafe
"static fpstring.h & debug_free" debug_free
109 :: FunPtr
(Ptr a
-> IO ())
110 debugForeignPtr fp n
=
111 withCString n
$ \cname
-> withForeignPtr fp
$ \p
->
112 do debug_alloc p cname
113 addForeignPtrFinalizer debug_free fp
115 debugForeignPtr _ _
= return ()
118 -- -----------------------------------------------------------------------------
119 -- unsafeWithInternals
121 -- | Do something with the internals of a PackedString. Beware of
122 -- altering the contents!
123 unsafeWithInternals
:: B
.ByteString
-> (Ptr Word8
-> Int -> IO a
) -> IO a
124 unsafeWithInternals ps f
125 = case BI
.toForeignPtr ps
of
126 (fp
,s
,l
) -> withForeignPtr fp
$ \p
-> f
(p `plusPtr` s
) l
128 -- | readIntPS skips any whitespace at the beginning of its argument, and
129 -- reads an Int from the beginning of the PackedString. If there is no
130 -- integer at the beginning of the string, it returns Nothing, otherwise it
131 -- just returns the int read, along with a B.ByteString containing the
132 -- remainder of its input.
134 readIntPS
:: B
.ByteString
-> Maybe (Int, B
.ByteString
)
135 readIntPS
= BC
.readInt . BC
.dropWhile isSpace
137 -- -----------------------------------------------------------------------------
138 -- Destructor functions (taking PackedStrings apart)
140 unpackPSfromUTF8
:: B
.ByteString
-> String
141 unpackPSfromUTF8 ps
=
142 case BI
.toForeignPtr ps
of
145 unsafePerformIO
$ withForeignPtr x
$ \p
->
146 do outbuf
<- mallocArray l
147 lout
<- fromIntegral `
fmap`
148 utf8_to_ints outbuf
(p `plusPtr` s
) (fromIntegral l
)
149 when (lout
< 0) $ error "Bad UTF8!"
150 str
<- (map (chr . fromIntegral)) `
fmap` peekArray lout outbuf
154 foreign import ccall unsafe
"static fpstring.h utf8_to_ints" utf8_to_ints
155 :: Ptr
Int -> Ptr Word8
-> CInt
-> IO CInt
157 -- -----------------------------------------------------------------------------
158 -- List-mimicking functions for PackedStrings
160 {-# INLINE ifHeadThenTail #-}
161 ifHeadThenTail
:: Word8
-> B
.ByteString
-> Maybe B
.ByteString
162 ifHeadThenTail c s
= case uncons s
of
163 Just
(w
, t
) | w
== c
-> Just t
166 #if __GLASGOW_HASKELL__
<= 606
167 -- | /O(1)/ Extract the head and tail of a ByteString, returning Nothing
169 uncons
:: B
.ByteString
-> Maybe (Word8
, B
.ByteString
)
172 |
otherwise = Just
(BI
.inlinePerformIO
$ withForeignPtr x
173 $ \p
-> peekByteOff p s
,
175 {-# INLINE uncons #-}
176 -- | /O(1)/ Build a ByteString from a ForeignPtr
177 fromForeignPtr
:: ForeignPtr Word8
181 fromForeignPtr fp s l
= BI
.PS fp s l
182 {-# INLINE fromForeignPtr #-}
183 -- | /O(n)/ The 'intercalate' function takes a 'ByteString' and a list of
184 -- 'ByteString's and concatenates the list after interspersing the first
185 -- argument between each element of the list.
186 intercalate
:: B
.ByteString
-> [B
.ByteString
] -> B
.ByteString
187 intercalate s
= B
.concat . (intersperse s
)
188 {-# INLINE [1] intercalate #-}
192 ------------------------------------------------------------------------
193 -- A reimplementation of Data.ByteString.Char8.dropSpace, but
194 -- specialised to darcs' need for a 4 way isspace.
196 -- TODO: if it is safe to use the expanded definition of isSpaceWord8
197 -- provided by Data.ByteString.Char8, then all this can go.
199 -- A locale-independent isspace(3) so patches are interpreted the same everywhere.
200 -- ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r')
201 isSpaceWord8
:: Word8
-> Bool
207 {-# INLINE isSpaceWord8 #-}
209 firstnonspace
:: Ptr Word8
-> Int -> Int -> IO Int
210 firstnonspace
!ptr
!n
!m
212 |
otherwise = do w
<- peekElemOff ptr n
213 if isSpaceWord8 w
then firstnonspace ptr
(n
+1) m
else return n
215 firstspace
:: Ptr Word8
-> Int -> Int -> IO Int
216 firstspace
!ptr
!n
!m
218 |
otherwise = do w
<- peekElemOff ptr n
219 if (not . isSpaceWord8
) w
then firstspace ptr
(n
+1) m
else return n
221 -- | 'dropSpace' efficiently returns the 'ByteString' argument with
222 -- white space Chars removed from the front. It is more efficient than
223 -- calling dropWhile for removing whitespace. I.e.
225 -- > dropWhile isSpace == dropSpace
227 dropSpace
:: B
.ByteString
-> B
.ByteString
228 dropSpace
(BI
.PS x s l
) = BI
.inlinePerformIO
$ withForeignPtr x
$ \p
-> do
229 i
<- firstnonspace
(p `plusPtr` s
) 0 l
230 return $! if i
== l
then B
.empty else BI
.PS x
(s
+i
) (l
-i
)
231 {-# INLINE dropSpace #-}
233 -- | 'breakSpace' returns the pair of ByteStrings when the argument is
234 -- broken at the first whitespace byte. I.e.
236 -- > break isSpace == breakSpace
238 breakSpace
:: B
.ByteString
-> (B
.ByteString
,B
.ByteString
)
239 breakSpace
(BI
.PS x s l
) = BI
.inlinePerformIO
$ withForeignPtr x
$ \p
-> do
240 i
<- firstspace
(p `plusPtr` s
) 0 l
241 return $! case () of {_
242 | i
== 0 -> (B
.empty, BI
.PS x s l
)
243 | i
== l
-> (BI
.PS x s l
, B
.empty)
244 |
otherwise -> (BI
.PS x s i
, BI
.PS x
(s
+i
) (l
-i
))
246 {-# INLINE breakSpace #-}
248 ------------------------------------------------------------------------
250 {-# INLINE is_funky #-}
251 is_funky
:: B
.ByteString
-> Bool
252 is_funky ps
= case BI
.toForeignPtr ps
of
254 unsafePerformIO
$ withForeignPtr x
$ \p
->
255 (/=0) `
fmap` has_funky_char
(p `plusPtr` s
) (fromIntegral l
)
257 foreign import ccall unsafe
"fpstring.h has_funky_char" has_funky_char
258 :: Ptr Word8
-> CInt
-> IO CInt
260 ------------------------------------------------------------------------
262 -- ByteString rewrites break (=='x') to breakByte 'x'
263 -- break ((==) x) = breakChar x
264 -- break (==x) = breakChar x
268 {-# INLINE breakOnPS #-}
269 breakOnPS
:: Char -> B
.ByteString
-> (B
.ByteString
, B
.ByteString
)
270 breakOnPS c p
= case BC
.elemIndex c p
of
271 Nothing
-> (p
, BC
.empty)
272 Just n
-> (B
.take n p
, B
.drop n p
)
275 {-# INLINE hashPS #-}
276 hashPS
:: B
.ByteString
-> Int32
278 case BI
.toForeignPtr ps
of
280 unsafePerformIO
$ withForeignPtr x
$ \p
->
281 do hash
(p `plusPtr` s
) l
283 hash
:: Ptr Word8
-> Int -> IO Int32
284 hash ptr len
= f
(0 :: Int32
) ptr len
285 where f h _
0 = return h
286 f h p n
= do x
<- peek p
287 let !h
' = (fromIntegral x
) + (rotateL h
8)
288 f h
' (p `advancePtr`
1) (n
-1)
290 {-# INLINE substrPS #-}
291 substrPS
:: B
.ByteString
-> B
.ByteString
-> Maybe Int
293 | B
.null tok
= Just
0
294 | B
.length tok
> B
.length str
= Nothing
295 |
otherwise = do n
<- BC
.elemIndex (BC
.head tok
) str
296 let ttok
= B
.tail tok
297 reststr
= B
.drop (n
+1) str
298 if ttok
== B
.take (B
.length ttok
) reststr
300 else ((n
+1)+) `
fmap` substrPS tok reststr
302 ------------------------------------------------------------------------
304 -- TODO: replace breakFirstPS and breakLastPS with definitions based on
305 -- ByteString's break/breakEnd
306 {-# INLINE breakFirstPS #-}
307 breakFirstPS
:: Char -> B
.ByteString
-> Maybe (B
.ByteString
,B
.ByteString
)
308 breakFirstPS c p
= case BC
.elemIndex c p
of
310 Just n
-> Just
(B
.take n p
, B
.drop (n
+1) p
)
312 {-# INLINE breakLastPS #-}
313 breakLastPS
:: Char -> B
.ByteString
-> Maybe (B
.ByteString
,B
.ByteString
)
314 breakLastPS c p
= case BC
.elemIndexEnd c p
of
316 Just n
-> Just
(B
.take n p
, B
.drop (n
+1) p
)
319 {-# INLINE linesPS #-}
320 linesPS
:: B
.ByteString
-> [B
.ByteString
]
322 | B
.null ps
= [B
.empty]
323 |
otherwise = BC
.split '\n' ps
325 {- QuickCheck property:
327 import Test.QuickCheck
328 import qualified Data.ByteString.Char8 as BC
330 instance Arbitrary BC.ByteString where
331 arbitrary = fmap BC.pack arbitrary
332 instance Arbitrary Char where
333 arbitrary = chr `fmap` choose (32,127)
334 deepCheck = check (defaultConfig { configMaxTest = 10000})
335 testLines = deepCheck (\x -> (linesPS x == linesPSOld x))
336 linesPSOld ps = case BC.elemIndex '\n' ps of
338 Just n -> B.take n ps : linesPS (B.drop (n+1) ps) -}
340 {-| This function acts exactly like the "Prelude" unlines function, or like
341 "Data.ByteString.Char8" 'unlines', but with one important difference: it will
342 produce a string which may not end with a newline! That is:
344 > unlinesPS ["foo", "bar"]
346 evaluates to \"foo\nbar\", not \"foo\nbar\n"! This point should hold true for
349 TODO: rename this function. -}
350 {-# INLINE unlinesPS #-}
351 unlinesPS
:: [B
.ByteString
] -> B
.ByteString
352 unlinesPS
[] = BC
.empty
353 unlinesPS x
= BC
.init $ BC
.unlines x
354 {- QuickCheck property:
356 testUnlines = deepCheck (\x -> (unlinesPS x == unlinesPSOld x))
357 unlinesPSOld ss = BC.concat $ intersperse_newlines ss
358 where intersperse_newlines (a:b:s) = a : newline : intersperse_newlines (b:s)
359 intersperse_newlines s = s
360 newline = BC.pack "\n" -}
362 -- -----------------------------------------------------------------------------
365 -- | Read an entire file, which may or may not be gzip compressed, directly
366 -- into a 'B.ByteString'.
368 #ifndef HAVE_HASKELL_ZLIB
369 foreign import ccall unsafe
"static zlib.h gzopen" c_gzopen
370 :: CString
-> CString
-> IO (Ptr
())
371 foreign import ccall unsafe
"static zlib.h gzclose" c_gzclose
373 foreign import ccall unsafe
"static zlib.h gzread" c_gzread
374 :: Ptr
() -> Ptr Word8
-> CInt
-> IO CInt
375 foreign import ccall unsafe
"static zlib.h gzwrite" c_gzwrite
376 :: Ptr
() -> Ptr Word8
-> CInt
-> IO CInt
379 gzReadFilePS
:: FilePath -> IO B
.ByteString
381 h
<- openBinaryFile f ReadMode
383 if header
/= BC
.pack
"\31\139"
386 else do hSeek h SeekFromEnd
(-4)
387 len
<- hGetLittleEndInt h
389 #ifdef HAVE_HASKELL_ZLIB
390 -- for now we ignore the length, but zlib will be modified
391 -- to add an API to say what length the result will be so
392 -- that BL.toChunks only produces one chunk, which in turn
393 -- means that B.concat won't need to copy data
394 -- the dummy use of len is to avoid a compiler warning
395 fmap (B
.concat . BL
.toChunks
. const GZ
.decompress len
) $ BL
.readFile f
-- ratify readFile: immediately consumed by the conversion to a strict bytestring
397 withCString f
$ \fstr
-> withCString
"rb" $ \rb
-> do
398 gzf
<- c_gzopen fstr rb
399 when (gzf
== nullPtr
) $ fail $ "problem opening file "++f
400 fp
<- BI
.mallocByteString len
401 debugForeignPtr fp
$ "gzReadFilePS "++f
402 lread
<- withForeignPtr fp
$ \p
->
403 c_gzread gzf p
(fromIntegral len
)
405 when (fromIntegral lread
/= len
) $
406 fail $ "problem gzreading file "++f
407 return $ fromForeignPtr fp
0 len
410 hGetLittleEndInt
:: Handle -> IO Int
411 hGetLittleEndInt h
= do
412 b1
<- ord `
fmap`
hGetChar h
413 b2
<- ord `
fmap`
hGetChar h
414 b3
<- ord `
fmap`
hGetChar h
415 b4
<- ord `
fmap`
hGetChar h
416 return $ b1
+ 256*b2
+ 65536*b3
+ 16777216*b4
418 gzWriteFilePS
:: FilePath -> B
.ByteString
-> IO ()
419 gzWriteFilePS f ps
= gzWriteFilePSs f
[ps
]
421 gzWriteFilePSs
:: FilePath -> [B
.ByteString
] -> IO ()
422 gzWriteFilePSs f pss
=
423 #ifdef HAVE_HASKELL_ZLIB
424 BL
.writeFile f
$ GZ
.compress
$ BL
.fromChunks pss
426 withCString f
$ \fstr
-> withCString
"wb" $ \wb
-> do
427 gzf
<- c_gzopen fstr wb
428 when (gzf
== nullPtr
) $ fail $ "problem gzopening file for write: "++f
429 mapM_ (gzWriteToGzf gzf
) pss `
catch`
430 \_
-> fail $ "problem gzwriting file: "++f
433 gzWriteToGzf
:: Ptr
() -> B
.ByteString
-> IO ()
434 gzWriteToGzf gzf ps
= case BI
.toForeignPtr ps
of
436 lw
<- withForeignPtr x
$ \p
-> c_gzwrite gzf
(p `plusPtr` s
)
438 when (fromIntegral lw
/= l
) $ fail $ "problem in gzWriteToGzf"
441 -- -----------------------------------------------------------------------------
444 -- | Like readFilePS, this reads an entire file directly into a
445 -- 'B.ByteString', but it is even more efficient. It involves directly
446 -- mapping the file to memory. This has the advantage that the contents of
447 -- the file never need to be copied. Also, under memory pressure the page
448 -- may simply be discarded, wile in the case of readFilePS it would need to
449 -- be written to swap. If you read many small files, mmapFilePS will be
450 -- less memory-efficient than readFilePS, since each mmapFilePS takes up a
451 -- separate page of memory. Also, you can run into bus errors if the file
452 -- is modified. NOTE: as with 'readFilePS', the string representation in
453 -- the file is assumed to be ISO-8859-1.
455 mmapFilePS
:: FilePath -> IO B
.ByteString
456 mmapFilePS f
= if use_mmap
457 then do (fp
,l
) <- mmap f
458 return $ fromForeignPtr fp
0 l
461 #if defined
(__GLASGOW_HASKELL__
)
462 foreign import ccall unsafe
"static fpstring.h my_mmap" my_mmap
463 :: CSize
-> CInt
-> IO (Ptr Word8
)
464 foreign import ccall unsafe
"static sys/mman.h munmap" c_munmap
465 :: Ptr Word8
-> CSize
-> IO CInt
466 foreign import ccall unsafe
"static unistd.h close" c_close
470 mmap
:: FilePath -> IO (ForeignPtr Word8
, Int)
472 h
<- openBinaryFile f ReadMode
473 l
<- fromIntegral `
fmap`
hFileSize h
474 -- Don't bother mmaping small files because each mmapped file takes up
475 -- at least one full VM block.
477 then do thefp
<- BI
.mallocByteString l
478 debugForeignPtr thefp
$ "mmap short file "++f
479 withForeignPtr thefp
$ \p
-> hGetBuf h p l
483 #if defined
(__GLASGOW_HASKELL__
)
484 fd
<- fromIntegral `
fmap` handleToFd h
485 p
<- my_mmap
(fromIntegral l
) fd
486 fp
<- if p
== nullPtr
491 do thefp
<- BI
.mallocByteString l
492 debugForeignPtr thefp
$ "mmap short file "++f
493 withForeignPtr thefp
$ \p
' -> hGetBuf h p
' l
495 #if defined
(__GLASGOW_HASKELL__
)
497 fp
<- FC
.newForeignPtr p
498 (do {c_munmap p
$ fromIntegral l
;
500 debugForeignPtr fp
$ "mmap "++f
506 where mmap_limit
= 16*1024
509 -- -------------------------------------------------------------------------
512 foreign import ccall unsafe
"static fpstring.h conv_to_hex" conv_to_hex
513 :: Ptr Word8
-> Ptr Word8
-> CInt
-> IO ()
515 fromPS2Hex
:: B
.ByteString
-> B
.ByteString
516 fromPS2Hex ps
= case BI
.toForeignPtr ps
of
518 BI
.unsafeCreate
(2*l
) $ \p
-> withForeignPtr x
$ \f ->
519 conv_to_hex p
(f `plusPtr` s
) $ fromIntegral l
521 -- -------------------------------------------------------------------------
524 foreign import ccall unsafe
"static fpstring.h conv_from_hex" conv_from_hex
525 :: Ptr Word8
-> Ptr Word8
-> CInt
-> IO ()
527 fromHex2PS
:: B
.ByteString
-> B
.ByteString
528 fromHex2PS ps
= case BI
.toForeignPtr ps
of
530 BI
.unsafeCreate
(l `
div`
2) $ \p
-> withForeignPtr x
$ \f ->
531 conv_from_hex p
(f `plusPtr` s
) (fromIntegral $ l `
div`
2)
533 -- -------------------------------------------------------------------------
536 -- | betweenLinesPS returns the B.ByteString between the two lines given,
537 -- or Nothing if they do not appear.
539 betweenLinesPS
:: B
.ByteString
-> B
.ByteString
-> B
.ByteString
540 -> Maybe (B
.ByteString
)
541 betweenLinesPS start end ps
542 = case break (start
==) (linesPS ps
) of
543 (_
, _
:rest
@(bs1
:_
)) ->
544 case BI
.toForeignPtr bs1
of
546 case break (end
==) rest
of
547 (_
, bs2
:_
) -> case BI
.toForeignPtr bs2
of (_
,s2
,_
) -> Just
$ fromForeignPtr ps1 s1
(s2
- s1
)
551 -- -------------------------------------------------------------------------
552 -- break_after_nth_newline
554 break_after_nth_newline
:: Int -> B
.ByteString
555 -> Maybe (B
.ByteString
, B
.ByteString
)
556 break_after_nth_newline
0 the_ps | B
.null the_ps
= Just
(B
.empty, B
.empty)
557 break_after_nth_newline n the_ps
=
558 case BI
.toForeignPtr the_ps
of
560 unsafePerformIO
$ withForeignPtr fp
$ \p
->
561 do let findit
0 s | s
== end
= return $ Just
(the_ps
, B
.empty)
562 findit _ s | s
== end
= return Nothing
563 findit
0 s
= let left_l
= s
- the_s
564 in return $ Just
(fromForeignPtr fp the_s left_l
,
565 fromForeignPtr fp s
(l
- left_l
))
566 findit i s
= do w
<- peekElemOff p s
567 if w
== nl
then findit
(i
-1) (s
+1)
573 -- -------------------------------------------------------------------------
574 -- break_before_nth_newline
576 break_before_nth_newline
:: Int -> B
.ByteString
-> (B
.ByteString
, B
.ByteString
)
577 break_before_nth_newline
0 the_ps
578 | B
.null the_ps
= (B
.empty, B
.empty)
579 break_before_nth_newline n the_ps
=
580 case BI
.toForeignPtr the_ps
of
582 unsafePerformIO
$ withForeignPtr fp
$ \p
->
583 do let findit _ s | s
== end
= return (the_ps
, B
.empty)
584 findit i s
= do w
<- peekElemOff p s
587 then let left_l
= s
- the_s
588 in return (fromForeignPtr fp the_s left_l
,
589 fromForeignPtr fp s
(l
- left_l
))
590 else findit
(i
-1) (s
+1)