1 % Copyright (C) 2002 David Roundy
2 % Copyright (C) 2005 Benedikt Schmidt
4 % This program is free software; you can redistribute it and/or modify
5 % it under the terms of the GNU General Public License as published by
6 % the Free Software Foundation; either version 2, or (at your option)
9 % This program is distributed in the hope that it will be useful,
10 % but WITHOUT ANY WARRANTY; without even the implied warranty of
11 % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 % GNU General Public License for more details.
14 % You should have received a copy of the GNU General Public License
15 % along with this program; see the file COPYING. If not, write to
16 % the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 % Boston, MA 02110-1301, USA.
21 \section{Introduction}
22 ``LCS'' stands for ``Longest Common Subsequence,'' and it is a relatively
23 challenging problem to find an LCS efficiently. This module implements
24 the algorithm described in:
25 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
26 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
27 especially the variation described in section 4.2 and most refinements
28 implemented in GNU diff (D is the edit-distance).
29 There is currently no heuristic to reduce the running time and produce
30 suboptimal output for large inputs with many differences. It behaves like
31 GNU diff with the -d option in this regard.
34 {-# OPTIONS_GHC -cpp #-}
35 module Lcs ( getChanges, aLen,
36 BArray, PArray, BSTArray,
37 shiftBoundaries ) where
41 import Control.Monad.ST
43 import FastPackedString
44 import Data.Array.Base
45 import Data.Array.Unboxed
46 import qualified Data.Map as Map ( lookup, empty, insertWith )
47 #include "impossible.h"
51 In the first step, a hash value for every line is calculated and collisions
52 are marked with a special value. This reduces a string comparison to an
53 int comparison for line tuples where at least one of the hash values is
54 not equal to the special value. After that, lines which only exists in one
55 of the files are removed and marked as changed which reduces the running
56 time of the following difference algorithm. GNU diff additionally removes
57 lines that appear very often in the other file in some cases.
58 The last step tries to create longer changed regions and line up deletions
59 in the first file to insertions in the second by shifting changed lines
63 -- | create a list of changes between a and b, each change has the form
64 -- (starta, lima, startb, limb) which means that a[starta, lima)
65 -- has to be replaced by b[startb, limb)
66 getChanges :: [PackedString] -> [PackedString]
67 -> [(Int,[PackedString],[PackedString])]
68 getChanges a b = dropStart (initP a) (initP b) 1
70 dropStart :: PArray -> PArray -> Int
71 -> [(Int,[PackedString],[PackedString])]
73 | off > (aLen a) = [(off - 1, [], getSlice b off (aLen b))]
74 | off > (aLen b) = [(off - 1, getSlice a off (aLen a), [])]
75 | a!off == b!off = dropStart a b (off + 1)
76 | otherwise = dropEnd a b off 0
78 dropEnd :: PArray -> PArray -> Int -> Int
79 -> [(Int,[PackedString],[PackedString])]
81 | off > alast = [(off - 1, [], getSlice b off blast)]
82 | off > blast = [(off - 1, getSlice a off alast, [])]
83 | a!alast == b!blast = dropEnd a b off (end + 1)
84 | otherwise = getChanges' (a, (off, alast)) (b, (off, blast))
85 where alast = aLen a - end
88 getSlice :: PArray -> Int -> Int -> [PackedString]
91 | otherwise = (a!(from)):(getSlice a (from + 1) to)
93 getChanges' :: (PArray, (Int, Int)) -> (PArray, (Int, Int))
94 -> [(Int,[PackedString],[PackedString])]
95 getChanges' (a, abounds) (b, bbounds) =
96 map (convertPatch 0 a b) $ createPatch c_a c_b
97 where toHash x bnds = listArray bnds [ hashPS $ x!i | i <- range bnds]
98 ah = toHash a abounds :: HArray
100 let ins (_,_,_,new) (collision,_,_,old) =
101 (collision || not (new == old), True, False, old)
102 m' = Map.insertWith ins (ah!i) (False, True, False, a!i) m
105 hm_a = mkAMap Map.empty (range abounds)
107 bh = toHash b bbounds :: HArray
109 let ins (_,_,_,new) (collision,in_a,_,old) =
110 (collision || not (new == old), in_a, True, old)
111 m' = Map.insertWith ins (bh!i) (False, False, True, b!i) m
114 hm = mkBMap hm_a (range bbounds)
115 -- take care of collisions, if there are different lines with the
116 -- same hash in both files, then set the hash to markColl,
117 -- PackedStrings are compared for two lines with the hash markColl
118 get (i, h) = case Map.lookup h hm of
119 Just (_,False,_,_) -> Nothing
120 Just (_,_,False,_) -> Nothing
121 Just (False,True,True,_) -> Just (i, h)
122 Just (True,True,True,_) -> Just (i, markColl)
123 Nothing -> impossible
125 a' = catMaybes $ map get [(i, ah!i) | i <- range (bounds ah)]
126 b' = catMaybes $ map get [(i, bh!i) | i <- range (bounds bh)]
128 (c_a, c_b) = diffArr a' b' (a, abounds) (b, bbounds)
130 -- | mark hash value where collision occured
134 -- | return arrays with changes in a and b (1 indexed), offsets start with 0
135 diffArr :: [(Int,Int32)] -> [(Int,Int32)]
136 -> (PArray, (Int, Int)) -> (PArray, (Int, Int))
138 diffArr a b (p_a, (off_a, l_a)) (p_b, (off_b, l_b)) = runST (
139 do let h_a = initH (map snd a)
140 h_b = initH (map snd b)
141 m_a = initM (map fst a)
142 m_b = initM (map fst b)
145 c_a <- initVChanged end_a
146 c_b <- initVChanged end_b
147 mapM_ (\ (l,_) -> writeArray c_a l False) $ a
148 mapM_ (\ (l,_) -> writeArray c_b l False) $ b
149 _ <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b 0 0 (aLen h_a) (aLen h_b)
150 let unchanged ar = do {xs <- getElems ar; return $ (length $ filter not $ xs) -1}
151 err <- liftM2 (/=) (unchanged c_a) (unchanged c_b)
152 when (err) impossible
153 -- Mark common lines at beginning and end
154 mapM_ (\ i -> writeArray c_a i False ) $ [1..(off_a - 1)]
155 mapM_ (\ i -> writeArray c_b i False ) $ [1..(off_b - 1)]
156 mapM_ (\ i -> writeArray c_a i False ) $ [(l_a + 1) .. (end_a)]
157 mapM_ (\ i -> writeArray c_b i False ) $ [(l_b + 1) .. (end_b)]
158 shiftBoundaries c_a c_b p_a 1 1
159 shiftBoundaries c_b c_a p_b 1 1
160 err1 <- liftM2 (/=) (unchanged c_a) (unchanged c_b)
161 when (err1) impossible
162 c_a' <- unsafeFreeze c_a
163 c_b' <- unsafeFreeze c_b
166 -- | set changes array for a and b and return number of changed lines
167 cmpseq :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
168 -> BSTArray s -> BSTArray s -> Int -> Int -> Int -> Int -> ST s Int
169 cmpseq _ _ _ _ _ _ _ _ _ _ 0 0 = do return 0
170 cmpseq h_a h_b p_a p_b m_a m_b c_a c_b off_a off_b l_a l_b = do
171 let lim_a = off_a+l_a
173 off_a' = findSnake h_a h_b p_a p_b m_a m_b off_a off_b l_a l_b off_a off_b
174 off_b' = off_b+off_a'-off_a
175 lim_a' = findSnakeRev h_a h_b p_a p_b m_a m_b lim_a lim_b off_a' off_b'
176 lim_b' = lim_b+lim_a'-lim_a
179 if l_a' == 0 || l_b' == 0
181 then do when (l_b' > 0) $
182 mapM_ (\i -> writeArray c_b (m_b!i) True)
183 [(off_b' + 1) .. lim_b']
185 else do when (l_a' > 0) $
186 mapM_ (\i -> writeArray c_a (m_a!i) True)
187 [(off_a' + 1) .. lim_a']
189 else do let m = l_a' + l_b'
193 vrev <- initVRev m l_a'
194 writeArray vrev 0 l_a'
196 (xmid, ymid, _) <- findDiag 1 h_a h_b p_a p_b m_a m_b v vrev
197 off_a' off_b' l_a' l_b' del dodd
198 when ((xmid == 0 && ymid == 0) || (xmid == l_a' && ymid == l_b')
199 || (xmid < 0 || ymid < 0 || xmid > l_a' || ymid > l_b'))
201 c1 <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b
202 off_a' off_b' xmid ymid
203 c2 <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b
204 (off_a' + xmid) (off_b' + ymid)
205 (l_a' - xmid) (l_b' - ymid)
208 -- | return (xmid, ymid, cost) for the two substrings
209 -- a[off_a+1..off_a+1+l_a] and b
210 findDiag :: Int -> HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
211 -> VSTArray s -> VSTArray s -> Int -> Int -> Int -> Int -> Int -> Bool
212 -> ST s (Int, Int, Int)
213 findDiag c h_a h_b p_a p_b m_a m_b v vrev off_a off_b l_a l_b del dodd = do
214 if c > l_a + l_b then error "findDiag failed" else return ()
217 Just (xmid, ymid) -> return (xmid, ymid, (c*2 - 1))
221 Just (xmid, ymid) -> return (xmid, ymid, c*2)
222 Nothing -> findDiag (c + 1) h_a h_b p_a p_b m_a m_b v vrev
223 off_a off_b l_a l_b del dodd
224 where fdmax = if c <= l_a then c else l_a - ((l_a + c) `mod` 2)
225 rdmax = if c <= l_b then c else l_b - ((l_b + c) `mod` 2)
226 lastrdmax = if (c-1) <= l_b then c-1 else l_b-((l_b + (c-1) `mod` 2))
227 lastrdmin = -(if (c-1) <= l_a then c-1 else l_a-((l_a + (c-1)) `mod` 2))
232 findF' d = do x <- findOne h_a h_b p_a p_b m_a m_b v d off_a off_b l_a l_b
233 if dodd && d - del >= lastrdmin && d - del <= lastrdmax
234 then do xr <- readArray vrev (d - del)
235 if xr <= x then return $ Just (x, x - d)
236 else if d <= fdmin then return Nothing
238 else if d <= fdmin then return Nothing else findF' (d-2)
239 findR' d = do x <- findOneRev h_a h_b p_a p_b m_a m_b vrev d del off_a off_b
240 if not dodd && (d + del >= fdmin) && (d + del <= fdmax)
241 then do xf <- readArray v (d + del)
242 if x <= xf then return $ Just (x,x-del-d)
243 else if d <= rdmin then return Nothing
245 else if d <= rdmin then return Nothing else findR' (d-2)
247 -- | find position on diag d with one more insert/delete going forward
248 findOne :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
249 -> VSTArray s -> Int -> Int -> Int -> Int -> Int -> ST s Int
250 findOne h_a h_b p_a p_b m_a m_b v d off_a off_b l_a l_b = do
251 x0 <- do xbelow <- readArray v (d - 1)
252 xover <- readArray v (d + 1)
253 return $ if xover > xbelow then xover else xbelow + 1
255 x = findSnake h_a h_b p_a p_b m_a m_b (x0+off_a) (y0+off_b)
257 writeArray v d (x - off_a)
260 -- | follow snake from northwest to southeast, x and y are absolute positions
261 findSnake :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
262 -> Int -> Int -> Int -> Int -> Int -> Int -> Int
263 findSnake h_a h_b p_a p_b m_a m_b x y l_a l_b off_a off_b =
264 if x < l_a + off_a && y < l_b + off_b && h_a!(x+1) == h_b!(y+1)
265 && (h_a!(x+1) /= markColl || p_a!(m_a!(x+1)) == p_b!(m_b!(y+1)))
266 then findSnake h_a h_b p_a p_b m_a m_b (x + 1) (y + 1) l_a l_b off_a off_b
269 -- | find position on diag d with one more insert/delete going backward
270 findOneRev :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
271 -> VSTArray s -> Int -> Int -> Int -> Int -> ST s Int
272 findOneRev h_a h_b p_a p_b m_a m_b v d del off_a off_b = do
273 x0 <- do xbelow <- readArray v (d - 1)
274 xover <- readArray v (d + 1)
275 return $ if xbelow < xover then xbelow else xover-1
276 let y0 = x0 - del - d
277 x = findSnakeRev h_a h_b p_a p_b m_a m_b (x0+off_a) (y0+off_b)
279 writeArray v d (x-off_a)
282 -- | follow snake from southeast to northwest, x and y are absolute positions
283 findSnakeRev :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
284 -> Int -> Int -> Int -> Int -> Int
285 findSnakeRev h_a h_b p_a p_b m_a m_b x y off_a off_b =
286 if x > off_a && y > off_b && h_a!x == h_b!y
287 && (h_a!x /= markColl || p_a!(m_a!x) == p_b!(m_b!y))
288 then findSnakeRev h_a h_b p_a p_b m_a m_b (x - 1) (y - 1) off_a off_b
294 -- | try to create nicer diffs by shifting around regions of changed lines
295 shiftBoundaries :: BSTArray s -> BSTArray s -> PArray -> Int -> Int -> ST s ()
296 shiftBoundaries c_a c_b p_a i_ j_ =
297 do x <- nextChanged c_a i_
300 do let skipped = start - i_
301 j1 <- nextUnchangedN c_b skipped j_
302 end <- nextUnchanged c_a start
303 j2 <- nextUnchanged c_b j1
304 (i3,j3) <- expand start end j2
305 shiftBoundaries c_a c_b p_a i3 j3
306 Nothing -> return () -- no change up to end of file
307 where noline = (aLen p_a) + 1
309 do let len = i - start
310 (start0,i0,j0) <- shiftBackward start i j
311 b <- if j0 > 1 then readArray c_b (j0-1) else return False
312 let corr = if b then i0 else noline
313 let blank = if p_a!(i0-1) == nilPS then i0
315 (start1,i1,j1,corr1,blank1) <- shiftForward start0 i0 j0 corr blank
316 -- prefer corresponding to ending with blank line
317 let newi = if corr1 == noline then blank1
319 (start2,i2,j2) <- moveCorr start1 i1 j1 newi
320 if len /= i2 - start2
321 then expand start2 i2 j2
323 shiftBackward start i j =
324 if start > 1 && p_a!(i-1) == p_a!(start-1)
325 then do when (i == start) impossible
326 b1 <- readArray c_a (i-1)
327 b2 <- readArray c_a (start-1)
328 when ((not b1) || b2) impossible
329 writeArray c_a (i-1) False
330 writeArray c_a (start-1) True
331 b <- if start > 2 then readArray c_a (start-2)
333 start' <- if b then liftM (1+) (prevUnchanged c_a (start-2))
334 else return (start-1)
335 j' <- prevUnchanged c_b (j-1)
336 shiftBackward start' (i-1) j'
337 else do return (start,i,j)
338 shiftForward start i j corr blank =
339 if i <= aLen p_a && p_a!i == p_a!start &&
340 -- nilPS at the end of file marks empty line after final newline
341 not ((i == aLen p_a) && (p_a!i == nilPS))
342 then do when (i == start) impossible
343 b1 <- readArray c_a i
344 b2 <- readArray c_a start
345 when ((not b2) || b1) impossible
346 writeArray c_a i True
347 writeArray c_a start False
348 i0 <- nextUnchanged c_a (i+1)
349 j0 <- nextUnchanged c_b (j+1)
350 let corr0 = if i0 > (i+1) then noline
351 else if j0-j > 2 then i0 else corr
352 let blank0 = if i0 > i+1 then noline
353 else if p_a!(i0-1) == nilPS then i0
355 shiftForward (start+1) i0 j0 corr0 blank0
356 else do return (start,i,j,corr,blank)
357 moveCorr start i j corr =
359 then return (start,i,j)
360 else do b1 <- readArray c_a (i-1)
361 b2 <- readArray c_a (start-1)
362 when ((not b1) || b2) impossible
363 when (p_a!(i-1) /= p_a!(start-1)) impossible
364 writeArray c_a (i-1) False
365 writeArray c_a (start-1) True
366 j' <- prevUnchanged c_b (j-1)
367 moveCorr (start-1) (i-1) j' corr
369 -- | goto next unchanged line, return the given line if unchanged
370 nextUnchanged :: BSTArray s -> Int -> ST s Int
371 nextUnchanged c i = do
373 if i == len + 1 then return i
374 else do b <- readArray c i
375 if b then nextUnchanged c (i+1)
378 -- | skip at least one unchanged line, if there is none advance
379 -- behind the last line
380 skipOneUnChanged :: BSTArray s -> Int -> ST s Int
381 skipOneUnChanged c i = do
385 else do b <- readArray c i
386 if not b then return (i+1)
387 else skipOneUnChanged c (i+1)
389 -- | goto n-th next unchanged line
390 nextUnchangedN :: BSTArray s -> Int -> Int -> ST s Int
391 nextUnchangedN c n i = do
392 if n == 0 then return i
393 else do i' <- skipOneUnChanged c i
394 nextUnchangedN c (n-1) i'
396 -- | goto next changed line, return the given line if changed
397 nextChanged :: BSTArray s -> Int -> ST s (Maybe Int)
401 then do b <- readArray c i
402 if not b then nextChanged c (i+1)
406 -- | goto previous unchanged line, return the given line if unchanged
407 prevUnchanged :: BSTArray s -> Int -> ST s Int
408 prevUnchanged c i = do
410 if b then prevUnchanged c (i-1)
415 type HArray = UArray Int Int32
416 type BArray = UArray Int Bool
417 type PArray = Array Int PackedString
418 type MapArray = UArray Int Int
419 type VSTArray s = STUArray s Int Int
420 type BSTArray s = STUArray s Int Bool
422 initV :: Int -> ST s (VSTArray s)
424 newArray (-(dmax + 1), dmax + 1) (-1)
426 initVRev :: Int -> Int -> ST s (VSTArray s)
427 initVRev dmax xmax = do
428 newArray (-(dmax + 1), dmax + 1) (xmax + 1)
430 -- 1 indexed, v[0] is used as a guard element
431 initVChanged :: Int -> ST s (BSTArray s)
433 a <- newArray (0, l) True
436 -- set to false for all lines which have a mapping later
437 -- other lines are only present in one of the files
439 initH :: [Int32] -> HArray
440 initH a = listArray (0, length a) (0:a)
442 initM :: [Int] -> MapArray
443 initM a = listArray (0, length a) (0:a)
445 initP :: [PackedString] -> PArray
446 initP a = listArray (0, length a) (nilPS:a)
448 #if __GLASGOW_HASKELL__ > 604
449 aLen :: (IArray a e) => a Int e -> Int
450 aLen a = snd $ bounds a
451 aLenM :: (MArray a e m) => a Int e -> m Int
452 aLenM a = getBounds a >>= return . snd
454 aLen :: HasBounds a => a Int e -> Int
455 aLen a = snd $ bounds a
456 aLenM :: (HasBounds a, Monad m) => a Int e -> m Int
457 aLenM = return . snd . bounds
462 convertPatch :: Int -> PArray -> PArray -> (Int, Int, Int, Int)
463 -> (Int,[PackedString],[PackedString])
464 convertPatch off a b (a0,a1,b0,b1)
465 | b0 == b1 = (b0+off,getDelete a a0 a1,[])
466 | a0 == a1 = (b0+off,[],getInsert b b0 b1)
467 | otherwise = (b0+off,getDelete a a0 a1,getInsert b b0 b1)
469 getInsert :: PArray -> Int -> Int -> [PackedString]
472 | otherwise = (b!(from+1)):(getInsert b (from+1) to)
473 getDelete :: PArray -> Int -> Int -> [PackedString]
476 | otherwise = (a!(from+1)):(getDelete a (from+1) to)
478 createPatch :: BArray -> BArray -> [(Int, Int, Int, Int)]
479 createPatch c_a c_b =
480 reverse $ createP c_a c_b (aLen c_a) (aLen c_b)
482 createP :: BArray -> BArray -> Int -> Int -> [(Int, Int, Int, Int)]
484 createP c_a c_b ia ib =
486 then let ia' = skipChangedRev c_a ia
487 ib' = skipChangedRev c_b ib
488 in (ia',ia,ib',ib):(createP c_a c_b ia' ib')
489 else createP c_a c_b (ia-1) (ib-1)
491 skipChangedRev :: BArray -> Int -> Int
492 skipChangedRev c i = if i >= 0 && c!i then skipChangedRev c (i-1) else i