Follow upstream changes -- Bytestring updates
[git-darcs-import.git] / src / Lcs.lhs
blob18f653aa2a445f5e811c0c430338c55cfb8eae96
1 % Copyright (C) 2002 David Roundy
2 % Copyright (C) 2005 Benedikt Schmidt
4 % This program is free software; you can redistribute it and/or modify
5 % it under the terms of the GNU General Public License as published by
6 % the Free Software Foundation; either version 2, or (at your option)
7 % any later version.
9 % This program is distributed in the hope that it will be useful,
10 % but WITHOUT ANY WARRANTY; without even the implied warranty of
11 % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 % GNU General Public License for more details.
14 % You should have received a copy of the GNU General Public License
15 % along with this program; see the file COPYING. If not, write to
16 % the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 % Boston, MA 02110-1301, USA.
19 \chapter{LCS}
21 \section{Introduction}
22 ``LCS'' stands for ``Longest Common Subsequence,'' and it is a relatively
23 challenging problem to find an LCS efficiently. This module implements
24 the algorithm described in:
25 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
26 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
27 especially the variation described in section 4.2 and most refinements
28 implemented in GNU diff (D is the edit-distance).
29 There is currently no heuristic to reduce the running time and produce
30 suboptimal output for large inputs with many differences. It behaves like
31 GNU diff with the -d option in this regard.
33 \begin{code}
34 {-# OPTIONS_GHC -cpp #-}
35 {-# LANGUAGE CPP #-}
37 module Lcs ( getChanges, aLen,
38 BArray, PArray, BSTArray,
39 shiftBoundaries ) where
41 import Control.Monad
42 import Data.Int
43 import Control.Monad.ST
44 import Data.Maybe
45 import ByteStringUtils (hashPS)
46 import qualified Data.ByteString as B (empty, ByteString)
47 import Data.Array.Base
48 import Data.Array.Unboxed
49 import qualified Data.Map as Map ( lookup, empty, insertWith )
50 #include "impossible.h"
52 \end{code}
54 In the first step, a hash value for every line is calculated and collisions
55 are marked with a special value. This reduces a string comparison to an
56 int comparison for line tuples where at least one of the hash values is
57 not equal to the special value. After that, lines which only exists in one
58 of the files are removed and marked as changed which reduces the running
59 time of the following difference algorithm. GNU diff additionally removes
60 lines that appear very often in the other file in some cases.
61 The last step tries to create longer changed regions and line up deletions
62 in the first file to insertions in the second by shifting changed lines
63 forward and backward.
65 \begin{code}
66 -- | create a list of changes between a and b, each change has the form
67 -- (starta, lima, startb, limb) which means that a[starta, lima)
68 -- has to be replaced by b[startb, limb)
69 getChanges :: [B.ByteString] -> [B.ByteString]
70 -> [(Int,[B.ByteString],[B.ByteString])]
71 getChanges a b = dropStart (initP a) (initP b) 1
73 dropStart :: PArray -> PArray -> Int
74 -> [(Int,[B.ByteString],[B.ByteString])]
75 dropStart a b off
76 | off > (aLen a) = [(off - 1, [], getSlice b off (aLen b))]
77 | off > (aLen b) = [(off - 1, getSlice a off (aLen a), [])]
78 | a!off == b!off = dropStart a b (off + 1)
79 | otherwise = dropEnd a b off 0
81 dropEnd :: PArray -> PArray -> Int -> Int
82 -> [(Int,[B.ByteString],[B.ByteString])]
83 dropEnd a b off end
84 | off > alast = [(off - 1, [], getSlice b off blast)]
85 | off > blast = [(off - 1, getSlice a off alast, [])]
86 | a!alast == b!blast = dropEnd a b off (end + 1)
87 | otherwise = getChanges' (a, (off, alast)) (b, (off, blast))
88 where alast = aLen a - end
89 blast = aLen b - end
91 getSlice :: PArray -> Int -> Int -> [B.ByteString]
92 getSlice a from to
93 | from > to = []
94 | otherwise = (a!(from)):(getSlice a (from + 1) to)
96 getChanges' :: (PArray, (Int, Int)) -> (PArray, (Int, Int))
97 -> [(Int,[B.ByteString],[B.ByteString])]
98 getChanges' (a, abounds) (b, bbounds) =
99 map (convertPatch 0 a b) $ createPatch c_a c_b
100 where
101 -- If the last few characters of two lines are the same, the lines are
102 -- probably the same. The choice of 20 is plucked out of the air.
103 toHash x bnds = listArray bnds [ hashPS $ x!i | i <- range bnds]
104 ah = toHash a abounds :: HArray
105 mkAMap m (i:is) =
106 let ins (_,_,_,new) (collision,_,_,old) =
107 (collision || not (new == old), True, False, old)
108 m' = Map.insertWith ins (ah!i) (False, True, False, a!i) m
109 in mkAMap m' is
110 mkAMap m _ = m
111 hm_a = mkAMap Map.empty (range abounds)
113 bh = toHash b bbounds :: HArray
114 mkBMap m (i:is) =
115 let ins (_,_,_,new) (collision,in_a,_,old) =
116 (collision || not (new == old), in_a, True, old)
117 m' = Map.insertWith ins (bh!i) (False, False, True, b!i) m
118 in mkBMap m' is
119 mkBMap m _ = m
120 hm = mkBMap hm_a (range bbounds)
121 -- take care of collisions, if there are different lines with the
122 -- same hash in both files, then set the hash to markColl,
123 -- PackedStrings are compared for two lines with the hash markColl
124 get (i, h) = case Map.lookup h hm of
125 Just (_,False,_,_) -> Nothing
126 Just (_,_,False,_) -> Nothing
127 Just (False,True,True,_) -> Just (i, h)
128 Just (True,True,True,_) -> Just (i, markColl)
129 Nothing -> impossible
131 a' = catMaybes $ map get [(i, ah!i) | i <- range (bounds ah)]
132 b' = catMaybes $ map get [(i, bh!i) | i <- range (bounds bh)]
134 (c_a, c_b) = diffArr a' b' (a, abounds) (b, bbounds)
136 -- | mark hash value where collision occured
137 markColl :: Int32
138 markColl = 2345677
140 -- | return arrays with changes in a and b (1 indexed), offsets start with 0
141 diffArr :: [(Int,Int32)] -> [(Int,Int32)]
142 -> (PArray, (Int, Int)) -> (PArray, (Int, Int))
143 -> (BArray, BArray)
144 diffArr a b (p_a, (off_a, l_a)) (p_b, (off_b, l_b)) = runST (
145 do let h_a = initH (map snd a)
146 h_b = initH (map snd b)
147 m_a = initM (map fst a)
148 m_b = initM (map fst b)
149 end_a = (aLen p_a)
150 end_b = (aLen p_b)
151 c_a <- initVChanged end_a
152 c_b <- initVChanged end_b
153 mapM_ (\ (l,_) -> writeArray c_a l False) $ a
154 mapM_ (\ (l,_) -> writeArray c_b l False) $ b
155 _ <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b 0 0 (aLen h_a) (aLen h_b)
156 let unchanged ar = do {xs <- getElems ar; return $ (length $ filter not $ xs) -1}
157 err <- liftM2 (/=) (unchanged c_a) (unchanged c_b)
158 when (err) impossible
159 -- Mark common lines at beginning and end
160 mapM_ (\ i -> writeArray c_a i False ) $ [1..(off_a - 1)]
161 mapM_ (\ i -> writeArray c_b i False ) $ [1..(off_b - 1)]
162 mapM_ (\ i -> writeArray c_a i False ) $ [(l_a + 1) .. (end_a)]
163 mapM_ (\ i -> writeArray c_b i False ) $ [(l_b + 1) .. (end_b)]
164 shiftBoundaries c_a c_b p_a 1 1
165 shiftBoundaries c_b c_a p_b 1 1
166 err1 <- liftM2 (/=) (unchanged c_a) (unchanged c_b)
167 when (err1) impossible
168 c_a' <- unsafeFreeze c_a
169 c_b' <- unsafeFreeze c_b
170 return (c_a', c_b'))
172 -- | set changes array for a and b and return number of changed lines
173 cmpseq :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
174 -> BSTArray s -> BSTArray s -> Int -> Int -> Int -> Int -> ST s Int
175 cmpseq _ _ _ _ _ _ _ _ _ _ 0 0 = do return 0
176 cmpseq h_a h_b p_a p_b m_a m_b c_a c_b off_a off_b l_a l_b = do
177 let lim_a = off_a+l_a
178 lim_b = off_b+l_b
179 off_a' = findSnake h_a h_b p_a p_b m_a m_b off_a off_b l_a l_b off_a off_b
180 off_b' = off_b+off_a'-off_a
181 lim_a' = findSnakeRev h_a h_b p_a p_b m_a m_b lim_a lim_b off_a' off_b'
182 lim_b' = lim_b+lim_a'-lim_a
183 l_a' = lim_a'-off_a'
184 l_b' = lim_b'-off_b'
185 if l_a' == 0 || l_b' == 0
186 then if l_a' == 0
187 then do when (l_b' > 0) $
188 mapM_ (\i -> writeArray c_b (m_b!i) True)
189 [(off_b' + 1) .. lim_b']
190 return l_b'
191 else do when (l_a' > 0) $
192 mapM_ (\i -> writeArray c_a (m_a!i) True)
193 [(off_a' + 1) .. lim_a']
194 return l_a'
195 else do let m = l_a' + l_b'
196 del = l_a' - l_b'
197 dodd = odd $ del
198 v <- initV m
199 vrev <- initVRev m l_a'
200 writeArray vrev 0 l_a'
201 writeArray v 0 0
202 (xmid, ymid, _) <- findDiag 1 h_a h_b p_a p_b m_a m_b v vrev
203 off_a' off_b' l_a' l_b' del dodd
204 when ((xmid == 0 && ymid == 0) || (xmid == l_a' && ymid == l_b')
205 || (xmid < 0 || ymid < 0 || xmid > l_a' || ymid > l_b'))
206 impossible
207 c1 <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b
208 off_a' off_b' xmid ymid
209 c2 <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b
210 (off_a' + xmid) (off_b' + ymid)
211 (l_a' - xmid) (l_b' - ymid)
212 return $ c1 + c2
214 -- | return (xmid, ymid, cost) for the two substrings
215 -- a[off_a+1..off_a+1+l_a] and b
216 findDiag :: Int -> HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
217 -> VSTArray s -> VSTArray s -> Int -> Int -> Int -> Int -> Int -> Bool
218 -> ST s (Int, Int, Int)
219 findDiag c h_a h_b p_a p_b m_a m_b v vrev off_a off_b l_a l_b del dodd = do
220 if c > l_a + l_b then error "findDiag failed" else return ()
221 r <- findF
222 case r of
223 Just (xmid, ymid) -> return (xmid, ymid, (c*2 - 1))
224 Nothing ->
225 do r' <- findR
226 case r' of
227 Just (xmid, ymid) -> return (xmid, ymid, c*2)
228 Nothing -> findDiag (c + 1) h_a h_b p_a p_b m_a m_b v vrev
229 off_a off_b l_a l_b del dodd
230 where fdmax = if c <= l_a then c else l_a - ((l_a + c) `mod` 2)
231 rdmax = if c <= l_b then c else l_b - ((l_b + c) `mod` 2)
232 lastrdmax = if (c-1) <= l_b then c-1 else l_b-((l_b + (c-1) `mod` 2))
233 lastrdmin = -(if (c-1) <= l_a then c-1 else l_a-((l_a + (c-1)) `mod` 2))
234 fdmin = -rdmax
235 rdmin = -fdmax
236 findF = findF' fdmax
237 findR = findR' rdmax
238 findF' d = do x <- findOne h_a h_b p_a p_b m_a m_b v d off_a off_b l_a l_b
239 if dodd && d - del >= lastrdmin && d - del <= lastrdmax
240 then do xr <- readArray vrev (d - del)
241 if xr <= x then return $ Just (x, x - d)
242 else if d <= fdmin then return Nothing
243 else findF' (d-2)
244 else if d <= fdmin then return Nothing else findF' (d-2)
245 findR' d = do x <- findOneRev h_a h_b p_a p_b m_a m_b vrev d del off_a off_b
246 if not dodd && (d + del >= fdmin) && (d + del <= fdmax)
247 then do xf <- readArray v (d + del)
248 if x <= xf then return $ Just (x,x-del-d)
249 else if d <= rdmin then return Nothing
250 else findR' (d-2)
251 else if d <= rdmin then return Nothing else findR' (d-2)
253 -- | find position on diag d with one more insert/delete going forward
254 findOne :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
255 -> VSTArray s -> Int -> Int -> Int -> Int -> Int -> ST s Int
256 findOne h_a h_b p_a p_b m_a m_b v d off_a off_b l_a l_b = do
257 x0 <- do xbelow <- readArray v (d - 1)
258 xover <- readArray v (d + 1)
259 return $ if xover > xbelow then xover else xbelow + 1
260 let y0 = x0 - d
261 x = findSnake h_a h_b p_a p_b m_a m_b (x0+off_a) (y0+off_b)
262 l_a l_b off_a off_b
263 writeArray v d (x - off_a)
264 return (x-off_a)
266 -- | follow snake from northwest to southeast, x and y are absolute positions
267 findSnake :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
268 -> Int -> Int -> Int -> Int -> Int -> Int -> Int
269 findSnake h_a h_b p_a p_b m_a m_b x y l_a l_b off_a off_b =
270 if x < l_a + off_a && y < l_b + off_b && h_a!(x+1) == h_b!(y+1)
271 && (h_a!(x+1) /= markColl || p_a!(m_a!(x+1)) == p_b!(m_b!(y+1)))
272 then findSnake h_a h_b p_a p_b m_a m_b (x + 1) (y + 1) l_a l_b off_a off_b
273 else x
275 -- | find position on diag d with one more insert/delete going backward
276 findOneRev :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
277 -> VSTArray s -> Int -> Int -> Int -> Int -> ST s Int
278 findOneRev h_a h_b p_a p_b m_a m_b v d del off_a off_b = do
279 x0 <- do xbelow <- readArray v (d - 1)
280 xover <- readArray v (d + 1)
281 return $ if xbelow < xover then xbelow else xover-1
282 let y0 = x0 - del - d
283 x = findSnakeRev h_a h_b p_a p_b m_a m_b (x0+off_a) (y0+off_b)
284 off_a off_b
285 writeArray v d (x-off_a)
286 return (x-off_a)
288 -- | follow snake from southeast to northwest, x and y are absolute positions
289 findSnakeRev :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
290 -> Int -> Int -> Int -> Int -> Int
291 findSnakeRev h_a h_b p_a p_b m_a m_b x y off_a off_b =
292 if x > off_a && y > off_b && h_a!x == h_b!y
293 && (h_a!x /= markColl || p_a!(m_a!x) == p_b!(m_b!y))
294 then findSnakeRev h_a h_b p_a p_b m_a m_b (x - 1) (y - 1) off_a off_b
295 else x
296 \end{code}
298 \begin{code}
300 -- | try to create nicer diffs by shifting around regions of changed lines
301 shiftBoundaries :: BSTArray s -> BSTArray s -> PArray -> Int -> Int -> ST s ()
302 shiftBoundaries c_a c_b p_a i_ j_ =
303 do x <- nextChanged c_a i_
304 case x of
305 Just start ->
306 do let skipped = start - i_
307 j1 <- nextUnchangedN c_b skipped j_
308 end <- nextUnchanged c_a start
309 j2 <- nextUnchanged c_b j1
310 (i3,j3) <- expand start end j2
311 shiftBoundaries c_a c_b p_a i3 j3
312 Nothing -> return () -- no change up to end of file
313 where noline = (aLen p_a) + 1
314 expand start i j =
315 do let len = i - start
316 (start0,i0,j0) <- shiftBackward start i j
317 b <- if j0 > 1 then readArray c_b (j0-1) else return False
318 let corr = if b then i0 else noline
319 let blank = if p_a!(i0-1) == B.empty then i0
320 else noline
321 (start1,i1,j1,corr1,blank1) <- shiftForward start0 i0 j0 corr blank
322 -- prefer corresponding to ending with blank line
323 let newi = if corr1 == noline then blank1
324 else corr1
325 (start2,i2,j2) <- moveCorr start1 i1 j1 newi
326 if len /= i2 - start2
327 then expand start2 i2 j2
328 else return (i2, j2)
329 shiftBackward start i j =
330 if start > 1 && p_a!(i-1) == p_a!(start-1)
331 then do when (i == start) impossible
332 b1 <- readArray c_a (i-1)
333 b2 <- readArray c_a (start-1)
334 when ((not b1) || b2) impossible
335 writeArray c_a (i-1) False
336 writeArray c_a (start-1) True
337 b <- if start > 2 then readArray c_a (start-2)
338 else return False
339 start' <- if b then liftM (1+) (prevUnchanged c_a (start-2))
340 else return (start-1)
341 j' <- prevUnchanged c_b (j-1)
342 shiftBackward start' (i-1) j'
343 else do return (start,i,j)
344 shiftForward start i j corr blank =
345 if i <= aLen p_a && p_a!i == p_a!start &&
346 -- B.empty at the end of file marks empty line after final newline
347 not ((i == aLen p_a) && (p_a!i == B.empty))
348 then do when (i == start) impossible
349 b1 <- readArray c_a i
350 b2 <- readArray c_a start
351 when ((not b2) || b1) impossible
352 writeArray c_a i True
353 writeArray c_a start False
354 i0 <- nextUnchanged c_a (i+1)
355 j0 <- nextUnchanged c_b (j+1)
356 let corr0 = if i0 > (i+1) then noline
357 else if j0-j > 2 then i0 else corr
358 let blank0 = if i0 > i+1 then noline
359 else if p_a!(i0-1) == B.empty then i0
360 else blank
361 shiftForward (start+1) i0 j0 corr0 blank0
362 else do return (start,i,j,corr,blank)
363 moveCorr start i j corr =
364 if corr >= i
365 then return (start,i,j)
366 else do b1 <- readArray c_a (i-1)
367 b2 <- readArray c_a (start-1)
368 when ((not b1) || b2) impossible
369 when (p_a!(i-1) /= p_a!(start-1)) impossible
370 writeArray c_a (i-1) False
371 writeArray c_a (start-1) True
372 j' <- prevUnchanged c_b (j-1)
373 moveCorr (start-1) (i-1) j' corr
375 -- | goto next unchanged line, return the given line if unchanged
376 nextUnchanged :: BSTArray s -> Int -> ST s Int
377 nextUnchanged c i = do
378 len <- aLenM c
379 if i == len + 1 then return i
380 else do b <- readArray c i
381 if b then nextUnchanged c (i+1)
382 else return i
384 -- | skip at least one unchanged line, if there is none advance
385 -- behind the last line
386 skipOneUnChanged :: BSTArray s -> Int -> ST s Int
387 skipOneUnChanged c i = do
388 len <- aLenM c
389 if i == len + 1
390 then return i
391 else do b <- readArray c i
392 if not b then return (i+1)
393 else skipOneUnChanged c (i+1)
395 -- | goto n-th next unchanged line
396 nextUnchangedN :: BSTArray s -> Int -> Int -> ST s Int
397 nextUnchangedN c n i = do
398 if n == 0 then return i
399 else do i' <- skipOneUnChanged c i
400 nextUnchangedN c (n-1) i'
402 -- | goto next changed line, return the given line if changed
403 nextChanged :: BSTArray s -> Int -> ST s (Maybe Int)
404 nextChanged c i = do
405 len <- aLenM c
406 if i <= len
407 then do b <- readArray c i
408 if not b then nextChanged c (i+1)
409 else return $ Just i
410 else return Nothing
412 -- | goto previous unchanged line, return the given line if unchanged
413 prevUnchanged :: BSTArray s -> Int -> ST s Int
414 prevUnchanged c i = do
415 b <- readArray c i
416 if b then prevUnchanged c (i-1)
417 else return i
418 \end{code}
420 \begin{code}
421 type HArray = UArray Int Int32
422 type BArray = UArray Int Bool
423 type PArray = Array Int B.ByteString
424 type MapArray = UArray Int Int
425 type VSTArray s = STUArray s Int Int
426 type BSTArray s = STUArray s Int Bool
428 initV :: Int -> ST s (VSTArray s)
429 initV dmax = do
430 newArray (-(dmax + 1), dmax + 1) (-1)
432 initVRev :: Int -> Int -> ST s (VSTArray s)
433 initVRev dmax xmax = do
434 newArray (-(dmax + 1), dmax + 1) (xmax + 1)
436 -- 1 indexed, v[0] is used as a guard element
437 initVChanged :: Int -> ST s (BSTArray s)
438 initVChanged l = do
439 a <- newArray (0, l) True
440 writeArray a 0 False
441 return a
442 -- set to false for all lines which have a mapping later
443 -- other lines are only present in one of the files
445 initH :: [Int32] -> HArray
446 initH a = listArray (0, length a) (0:a)
448 initM :: [Int] -> MapArray
449 initM a = listArray (0, length a) (0:a)
451 initP :: [B.ByteString] -> PArray
452 initP a = listArray (0, length a) (B.empty:a)
454 #if __GLASGOW_HASKELL__ > 604
455 aLen :: (IArray a e) => a Int e -> Int
456 aLen a = snd $ bounds a
457 aLenM :: (MArray a e m) => a Int e -> m Int
458 aLenM a = getBounds a >>= return . snd
459 #else
460 aLen :: HasBounds a => a Int e -> Int
461 aLen a = snd $ bounds a
462 aLenM :: (HasBounds a, Monad m) => a Int e -> m Int
463 aLenM = return . snd . bounds
464 #endif
465 \end{code}
467 \begin{code}
468 convertPatch :: Int -> PArray -> PArray -> (Int, Int, Int, Int)
469 -> (Int,[B.ByteString],[B.ByteString])
470 convertPatch off a b (a0,a1,b0,b1)
471 | b0 == b1 = (b0+off,getDelete a a0 a1,[])
472 | a0 == a1 = (b0+off,[],getInsert b b0 b1)
473 | otherwise = (b0+off,getDelete a a0 a1,getInsert b b0 b1)
475 getInsert :: PArray -> Int -> Int -> [B.ByteString]
476 getInsert b from to
477 | from >= to = []
478 | otherwise = (b!(from+1)):(getInsert b (from+1) to)
479 getDelete :: PArray -> Int -> Int -> [B.ByteString]
480 getDelete a from to
481 | from >= to = []
482 | otherwise = (a!(from+1)):(getDelete a (from+1) to)
484 createPatch :: BArray -> BArray -> [(Int, Int, Int, Int)]
485 createPatch c_a c_b =
486 reverse $ createP c_a c_b (aLen c_a) (aLen c_b)
488 createP :: BArray -> BArray -> Int -> Int -> [(Int, Int, Int, Int)]
489 createP _ _ 0 0 = []
490 createP c_a c_b ia ib =
491 if c_a!ia || c_b!ib
492 then let ia' = skipChangedRev c_a ia
493 ib' = skipChangedRev c_b ib
494 in (ia',ia,ib',ib):(createP c_a c_b ia' ib')
495 else createP c_a c_b (ia-1) (ib-1)
497 skipChangedRev :: BArray -> Int -> Int
498 skipChangedRev c i = if i >= 0 && c!i then skipChangedRev c (i-1) else i
499 \end{code}