Made committer details and time = author details and time
[git-darcs-import.git] / src / Lcs.lhs
bloba4691c16c3a01f52f851172367cffd8af29424ad
1 % Copyright (C) 2002 David Roundy
2 % Copyright (C) 2005 Benedikt Schmidt
4 % This program is free software; you can redistribute it and/or modify
5 % it under the terms of the GNU General Public License as published by
6 % the Free Software Foundation; either version 2, or (at your option)
7 % any later version.
9 % This program is distributed in the hope that it will be useful,
10 % but WITHOUT ANY WARRANTY; without even the implied warranty of
11 % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 % GNU General Public License for more details.
14 % You should have received a copy of the GNU General Public License
15 % along with this program; see the file COPYING. If not, write to
16 % the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
17 % Boston, MA 02110-1301, USA.
19 \chapter{LCS}
21 \section{Introduction}
22 ``LCS'' stands for ``Longest Common Subsequence,'' and it is a relatively
23 challenging problem to find an LCS efficiently. This module implements
24 the algorithm described in:
25 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
26 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
27 especially the variation described in section 4.2 and most refinements
28 implemented in GNU diff (D is the edit-distance).
29 There is currently no heuristic to reduce the running time and produce
30 suboptimal output for large inputs with many differences. It behaves like
31 GNU diff with the -d option in this regard.
33 \begin{code}
34 {-# OPTIONS_GHC -cpp #-}
35 module Lcs ( getChanges, aLen,
36 BArray, PArray, BSTArray,
37 shiftBoundaries ) where
39 import Control.Monad
40 import Data.Int
41 import Control.Monad.ST
42 import Data.Maybe
43 import FastPackedString
44 import Data.Array.Base
45 import Data.Array.Unboxed
46 import qualified Data.Map as Map ( lookup, empty, insertWith )
47 #include "impossible.h"
49 \end{code}
51 In the first step, a hash value for every line is calculated and collisions
52 are marked with a special value. This reduces a string comparison to an
53 int comparison for line tuples where at least one of the hash values is
54 not equal to the special value. After that, lines which only exists in one
55 of the files are removed and marked as changed which reduces the running
56 time of the following difference algorithm. GNU diff additionally removes
57 lines that appear very often in the other file in some cases.
58 The last step tries to create longer changed regions and line up deletions
59 in the first file to insertions in the second by shifting changed lines
60 forward and backward.
62 \begin{code}
63 -- | create a list of changes between a and b, each change has the form
64 -- (starta, lima, startb, limb) which means that a[starta, lima)
65 -- has to be replaced by b[startb, limb)
66 getChanges :: [PackedString] -> [PackedString]
67 -> [(Int,[PackedString],[PackedString])]
68 getChanges a b = dropStart (initP a) (initP b) 1
70 dropStart :: PArray -> PArray -> Int
71 -> [(Int,[PackedString],[PackedString])]
72 dropStart a b off
73 | off > (aLen a) = [(off - 1, [], getSlice b off (aLen b))]
74 | off > (aLen b) = [(off - 1, getSlice a off (aLen a), [])]
75 | a!off == b!off = dropStart a b (off + 1)
76 | otherwise = dropEnd a b off 0
78 dropEnd :: PArray -> PArray -> Int -> Int
79 -> [(Int,[PackedString],[PackedString])]
80 dropEnd a b off end
81 | off > alast = [(off - 1, [], getSlice b off blast)]
82 | off > blast = [(off - 1, getSlice a off alast, [])]
83 | a!alast == b!blast = dropEnd a b off (end + 1)
84 | otherwise = getChanges' (a, (off, alast)) (b, (off, blast))
85 where alast = aLen a - end
86 blast = aLen b - end
88 getSlice :: PArray -> Int -> Int -> [PackedString]
89 getSlice a from to
90 | from > to = []
91 | otherwise = (a!(from)):(getSlice a (from + 1) to)
93 getChanges' :: (PArray, (Int, Int)) -> (PArray, (Int, Int))
94 -> [(Int,[PackedString],[PackedString])]
95 getChanges' (a, abounds) (b, bbounds) =
96 map (convertPatch 0 a b) $ createPatch c_a c_b
97 where toHash x bnds = listArray bnds [ hashPS $ x!i | i <- range bnds]
98 ah = toHash a abounds :: HArray
99 mkAMap m (i:is) =
100 let ins (_,_,_,new) (collision,_,_,old) =
101 (collision || not (new == old), True, False, old)
102 m' = Map.insertWith ins (ah!i) (False, True, False, a!i) m
103 in mkAMap m' is
104 mkAMap m _ = m
105 hm_a = mkAMap Map.empty (range abounds)
107 bh = toHash b bbounds :: HArray
108 mkBMap m (i:is) =
109 let ins (_,_,_,new) (collision,in_a,_,old) =
110 (collision || not (new == old), in_a, True, old)
111 m' = Map.insertWith ins (bh!i) (False, False, True, b!i) m
112 in mkBMap m' is
113 mkBMap m _ = m
114 hm = mkBMap hm_a (range bbounds)
115 -- take care of collisions, if there are different lines with the
116 -- same hash in both files, then set the hash to markColl,
117 -- PackedStrings are compared for two lines with the hash markColl
118 get (i, h) = case Map.lookup h hm of
119 Just (_,False,_,_) -> Nothing
120 Just (_,_,False,_) -> Nothing
121 Just (False,True,True,_) -> Just (i, h)
122 Just (True,True,True,_) -> Just (i, markColl)
123 Nothing -> impossible
125 a' = catMaybes $ map get [(i, ah!i) | i <- range (bounds ah)]
126 b' = catMaybes $ map get [(i, bh!i) | i <- range (bounds bh)]
128 (c_a, c_b) = diffArr a' b' (a, abounds) (b, bbounds)
130 -- | mark hash value where collision occured
131 markColl :: Int32
132 markColl = 2345677
134 -- | return arrays with changes in a and b (1 indexed), offsets start with 0
135 diffArr :: [(Int,Int32)] -> [(Int,Int32)]
136 -> (PArray, (Int, Int)) -> (PArray, (Int, Int))
137 -> (BArray, BArray)
138 diffArr a b (p_a, (off_a, l_a)) (p_b, (off_b, l_b)) = runST (
139 do let h_a = initH (map snd a)
140 h_b = initH (map snd b)
141 m_a = initM (map fst a)
142 m_b = initM (map fst b)
143 end_a = (aLen p_a)
144 end_b = (aLen p_b)
145 c_a <- initVChanged end_a
146 c_b <- initVChanged end_b
147 mapM_ (\ (l,_) -> writeArray c_a l False) $ a
148 mapM_ (\ (l,_) -> writeArray c_b l False) $ b
149 _ <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b 0 0 (aLen h_a) (aLen h_b)
150 let unchanged ar = do {xs <- getElems ar; return $ (length $ filter not $ xs) -1}
151 err <- liftM2 (/=) (unchanged c_a) (unchanged c_b)
152 when (err) impossible
153 -- Mark common lines at beginning and end
154 mapM_ (\ i -> writeArray c_a i False ) $ [1..(off_a - 1)]
155 mapM_ (\ i -> writeArray c_b i False ) $ [1..(off_b - 1)]
156 mapM_ (\ i -> writeArray c_a i False ) $ [(l_a + 1) .. (end_a)]
157 mapM_ (\ i -> writeArray c_b i False ) $ [(l_b + 1) .. (end_b)]
158 shiftBoundaries c_a c_b p_a 1 1
159 shiftBoundaries c_b c_a p_b 1 1
160 err1 <- liftM2 (/=) (unchanged c_a) (unchanged c_b)
161 when (err1) impossible
162 c_a' <- unsafeFreeze c_a
163 c_b' <- unsafeFreeze c_b
164 return (c_a', c_b'))
166 -- | set changes array for a and b and return number of changed lines
167 cmpseq :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
168 -> BSTArray s -> BSTArray s -> Int -> Int -> Int -> Int -> ST s Int
169 cmpseq _ _ _ _ _ _ _ _ _ _ 0 0 = do return 0
170 cmpseq h_a h_b p_a p_b m_a m_b c_a c_b off_a off_b l_a l_b = do
171 let lim_a = off_a+l_a
172 lim_b = off_b+l_b
173 off_a' = findSnake h_a h_b p_a p_b m_a m_b off_a off_b l_a l_b off_a off_b
174 off_b' = off_b+off_a'-off_a
175 lim_a' = findSnakeRev h_a h_b p_a p_b m_a m_b lim_a lim_b off_a' off_b'
176 lim_b' = lim_b+lim_a'-lim_a
177 l_a' = lim_a'-off_a'
178 l_b' = lim_b'-off_b'
179 if l_a' == 0 || l_b' == 0
180 then if l_a' == 0
181 then do when (l_b' > 0) $
182 mapM_ (\i -> writeArray c_b (m_b!i) True)
183 [(off_b' + 1) .. lim_b']
184 return l_b'
185 else do when (l_a' > 0) $
186 mapM_ (\i -> writeArray c_a (m_a!i) True)
187 [(off_a' + 1) .. lim_a']
188 return l_a'
189 else do let m = l_a' + l_b'
190 del = l_a' - l_b'
191 dodd = odd $ del
192 v <- initV m
193 vrev <- initVRev m l_a'
194 writeArray vrev 0 l_a'
195 writeArray v 0 0
196 (xmid, ymid, _) <- findDiag 1 h_a h_b p_a p_b m_a m_b v vrev
197 off_a' off_b' l_a' l_b' del dodd
198 when ((xmid == 0 && ymid == 0) || (xmid == l_a' && ymid == l_b')
199 || (xmid < 0 || ymid < 0 || xmid > l_a' || ymid > l_b'))
200 impossible
201 c1 <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b
202 off_a' off_b' xmid ymid
203 c2 <- cmpseq h_a h_b p_a p_b m_a m_b c_a c_b
204 (off_a' + xmid) (off_b' + ymid)
205 (l_a' - xmid) (l_b' - ymid)
206 return $ c1 + c2
208 -- | return (xmid, ymid, cost) for the two substrings
209 -- a[off_a+1..off_a+1+l_a] and b
210 findDiag :: Int -> HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
211 -> VSTArray s -> VSTArray s -> Int -> Int -> Int -> Int -> Int -> Bool
212 -> ST s (Int, Int, Int)
213 findDiag c h_a h_b p_a p_b m_a m_b v vrev off_a off_b l_a l_b del dodd = do
214 if c > l_a + l_b then error "findDiag failed" else return ()
215 r <- findF
216 case r of
217 Just (xmid, ymid) -> return (xmid, ymid, (c*2 - 1))
218 Nothing ->
219 do r' <- findR
220 case r' of
221 Just (xmid, ymid) -> return (xmid, ymid, c*2)
222 Nothing -> findDiag (c + 1) h_a h_b p_a p_b m_a m_b v vrev
223 off_a off_b l_a l_b del dodd
224 where fdmax = if c <= l_a then c else l_a - ((l_a + c) `mod` 2)
225 rdmax = if c <= l_b then c else l_b - ((l_b + c) `mod` 2)
226 lastrdmax = if (c-1) <= l_b then c-1 else l_b-((l_b + (c-1) `mod` 2))
227 lastrdmin = -(if (c-1) <= l_a then c-1 else l_a-((l_a + (c-1)) `mod` 2))
228 fdmin = -rdmax
229 rdmin = -fdmax
230 findF = findF' fdmax
231 findR = findR' rdmax
232 findF' d = do x <- findOne h_a h_b p_a p_b m_a m_b v d off_a off_b l_a l_b
233 if dodd && d - del >= lastrdmin && d - del <= lastrdmax
234 then do xr <- readArray vrev (d - del)
235 if xr <= x then return $ Just (x, x - d)
236 else if d <= fdmin then return Nothing
237 else findF' (d-2)
238 else if d <= fdmin then return Nothing else findF' (d-2)
239 findR' d = do x <- findOneRev h_a h_b p_a p_b m_a m_b vrev d del off_a off_b
240 if not dodd && (d + del >= fdmin) && (d + del <= fdmax)
241 then do xf <- readArray v (d + del)
242 if x <= xf then return $ Just (x,x-del-d)
243 else if d <= rdmin then return Nothing
244 else findR' (d-2)
245 else if d <= rdmin then return Nothing else findR' (d-2)
247 -- | find position on diag d with one more insert/delete going forward
248 findOne :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
249 -> VSTArray s -> Int -> Int -> Int -> Int -> Int -> ST s Int
250 findOne h_a h_b p_a p_b m_a m_b v d off_a off_b l_a l_b = do
251 x0 <- do xbelow <- readArray v (d - 1)
252 xover <- readArray v (d + 1)
253 return $ if xover > xbelow then xover else xbelow + 1
254 let y0 = x0 - d
255 x = findSnake h_a h_b p_a p_b m_a m_b (x0+off_a) (y0+off_b)
256 l_a l_b off_a off_b
257 writeArray v d (x - off_a)
258 return (x-off_a)
260 -- | follow snake from northwest to southeast, x and y are absolute positions
261 findSnake :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
262 -> Int -> Int -> Int -> Int -> Int -> Int -> Int
263 findSnake h_a h_b p_a p_b m_a m_b x y l_a l_b off_a off_b =
264 if x < l_a + off_a && y < l_b + off_b && h_a!(x+1) == h_b!(y+1)
265 && (h_a!(x+1) /= markColl || p_a!(m_a!(x+1)) == p_b!(m_b!(y+1)))
266 then findSnake h_a h_b p_a p_b m_a m_b (x + 1) (y + 1) l_a l_b off_a off_b
267 else x
269 -- | find position on diag d with one more insert/delete going backward
270 findOneRev :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
271 -> VSTArray s -> Int -> Int -> Int -> Int -> ST s Int
272 findOneRev h_a h_b p_a p_b m_a m_b v d del off_a off_b = do
273 x0 <- do xbelow <- readArray v (d - 1)
274 xover <- readArray v (d + 1)
275 return $ if xbelow < xover then xbelow else xover-1
276 let y0 = x0 - del - d
277 x = findSnakeRev h_a h_b p_a p_b m_a m_b (x0+off_a) (y0+off_b)
278 off_a off_b
279 writeArray v d (x-off_a)
280 return (x-off_a)
282 -- | follow snake from southeast to northwest, x and y are absolute positions
283 findSnakeRev :: HArray -> HArray -> PArray -> PArray -> MapArray -> MapArray
284 -> Int -> Int -> Int -> Int -> Int
285 findSnakeRev h_a h_b p_a p_b m_a m_b x y off_a off_b =
286 if x > off_a && y > off_b && h_a!x == h_b!y
287 && (h_a!x /= markColl || p_a!(m_a!x) == p_b!(m_b!y))
288 then findSnakeRev h_a h_b p_a p_b m_a m_b (x - 1) (y - 1) off_a off_b
289 else x
290 \end{code}
292 \begin{code}
294 -- | try to create nicer diffs by shifting around regions of changed lines
295 shiftBoundaries :: BSTArray s -> BSTArray s -> PArray -> Int -> Int -> ST s ()
296 shiftBoundaries c_a c_b p_a i_ j_ =
297 do x <- nextChanged c_a i_
298 case x of
299 Just start ->
300 do let skipped = start - i_
301 j1 <- nextUnchangedN c_b skipped j_
302 end <- nextUnchanged c_a start
303 j2 <- nextUnchanged c_b j1
304 (i3,j3) <- expand start end j2
305 shiftBoundaries c_a c_b p_a i3 j3
306 Nothing -> return () -- no change up to end of file
307 where noline = (aLen p_a) + 1
308 expand start i j =
309 do let len = i - start
310 (start0,i0,j0) <- shiftBackward start i j
311 b <- if j0 > 1 then readArray c_b (j0-1) else return False
312 let corr = if b then i0 else noline
313 let blank = if p_a!(i0-1) == nilPS then i0
314 else noline
315 (start1,i1,j1,corr1,blank1) <- shiftForward start0 i0 j0 corr blank
316 -- prefer corresponding to ending with blank line
317 let newi = if corr1 == noline then blank1
318 else corr1
319 (start2,i2,j2) <- moveCorr start1 i1 j1 newi
320 if len /= i2 - start2
321 then expand start2 i2 j2
322 else return (i2, j2)
323 shiftBackward start i j =
324 if start > 1 && p_a!(i-1) == p_a!(start-1)
325 then do when (i == start) impossible
326 b1 <- readArray c_a (i-1)
327 b2 <- readArray c_a (start-1)
328 when ((not b1) || b2) impossible
329 writeArray c_a (i-1) False
330 writeArray c_a (start-1) True
331 b <- if start > 2 then readArray c_a (start-2)
332 else return False
333 start' <- if b then liftM (1+) (prevUnchanged c_a (start-2))
334 else return (start-1)
335 j' <- prevUnchanged c_b (j-1)
336 shiftBackward start' (i-1) j'
337 else do return (start,i,j)
338 shiftForward start i j corr blank =
339 if i <= aLen p_a && p_a!i == p_a!start &&
340 -- nilPS at the end of file marks empty line after final newline
341 not ((i == aLen p_a) && (p_a!i == nilPS))
342 then do when (i == start) impossible
343 b1 <- readArray c_a i
344 b2 <- readArray c_a start
345 when ((not b2) || b1) impossible
346 writeArray c_a i True
347 writeArray c_a start False
348 i0 <- nextUnchanged c_a (i+1)
349 j0 <- nextUnchanged c_b (j+1)
350 let corr0 = if i0 > (i+1) then noline
351 else if j0-j > 2 then i0 else corr
352 let blank0 = if i0 > i+1 then noline
353 else if p_a!(i0-1) == nilPS then i0
354 else blank
355 shiftForward (start+1) i0 j0 corr0 blank0
356 else do return (start,i,j,corr,blank)
357 moveCorr start i j corr =
358 if corr >= i
359 then return (start,i,j)
360 else do b1 <- readArray c_a (i-1)
361 b2 <- readArray c_a (start-1)
362 when ((not b1) || b2) impossible
363 when (p_a!(i-1) /= p_a!(start-1)) impossible
364 writeArray c_a (i-1) False
365 writeArray c_a (start-1) True
366 j' <- prevUnchanged c_b (j-1)
367 moveCorr (start-1) (i-1) j' corr
369 -- | goto next unchanged line, return the given line if unchanged
370 nextUnchanged :: BSTArray s -> Int -> ST s Int
371 nextUnchanged c i = do
372 len <- aLenM c
373 if i == len + 1 then return i
374 else do b <- readArray c i
375 if b then nextUnchanged c (i+1)
376 else return i
378 -- | skip at least one unchanged line, if there is none advance
379 -- behind the last line
380 skipOneUnChanged :: BSTArray s -> Int -> ST s Int
381 skipOneUnChanged c i = do
382 len <- aLenM c
383 if i == len + 1
384 then return i
385 else do b <- readArray c i
386 if not b then return (i+1)
387 else skipOneUnChanged c (i+1)
389 -- | goto n-th next unchanged line
390 nextUnchangedN :: BSTArray s -> Int -> Int -> ST s Int
391 nextUnchangedN c n i = do
392 if n == 0 then return i
393 else do i' <- skipOneUnChanged c i
394 nextUnchangedN c (n-1) i'
396 -- | goto next changed line, return the given line if changed
397 nextChanged :: BSTArray s -> Int -> ST s (Maybe Int)
398 nextChanged c i = do
399 len <- aLenM c
400 if i <= len
401 then do b <- readArray c i
402 if not b then nextChanged c (i+1)
403 else return $ Just i
404 else return Nothing
406 -- | goto previous unchanged line, return the given line if unchanged
407 prevUnchanged :: BSTArray s -> Int -> ST s Int
408 prevUnchanged c i = do
409 b <- readArray c i
410 if b then prevUnchanged c (i-1)
411 else return i
412 \end{code}
414 \begin{code}
415 type HArray = UArray Int Int32
416 type BArray = UArray Int Bool
417 type PArray = Array Int PackedString
418 type MapArray = UArray Int Int
419 type VSTArray s = STUArray s Int Int
420 type BSTArray s = STUArray s Int Bool
422 initV :: Int -> ST s (VSTArray s)
423 initV dmax = do
424 newArray (-(dmax + 1), dmax + 1) (-1)
426 initVRev :: Int -> Int -> ST s (VSTArray s)
427 initVRev dmax xmax = do
428 newArray (-(dmax + 1), dmax + 1) (xmax + 1)
430 -- 1 indexed, v[0] is used as a guard element
431 initVChanged :: Int -> ST s (BSTArray s)
432 initVChanged l = do
433 a <- newArray (0, l) True
434 writeArray a 0 False
435 return a
436 -- set to false for all lines which have a mapping later
437 -- other lines are only present in one of the files
439 initH :: [Int32] -> HArray
440 initH a = listArray (0, length a) (0:a)
442 initM :: [Int] -> MapArray
443 initM a = listArray (0, length a) (0:a)
445 initP :: [PackedString] -> PArray
446 initP a = listArray (0, length a) (nilPS:a)
448 #if __GLASGOW_HASKELL__ > 604
449 aLen :: (IArray a e) => a Int e -> Int
450 aLen a = snd $ bounds a
451 aLenM :: (MArray a e m) => a Int e -> m Int
452 aLenM a = getBounds a >>= return . snd
453 #else
454 aLen :: HasBounds a => a Int e -> Int
455 aLen a = snd $ bounds a
456 aLenM :: (HasBounds a, Monad m) => a Int e -> m Int
457 aLenM = return . snd . bounds
458 #endif
459 \end{code}
461 \begin{code}
462 convertPatch :: Int -> PArray -> PArray -> (Int, Int, Int, Int)
463 -> (Int,[PackedString],[PackedString])
464 convertPatch off a b (a0,a1,b0,b1)
465 | b0 == b1 = (b0+off,getDelete a a0 a1,[])
466 | a0 == a1 = (b0+off,[],getInsert b b0 b1)
467 | otherwise = (b0+off,getDelete a a0 a1,getInsert b b0 b1)
469 getInsert :: PArray -> Int -> Int -> [PackedString]
470 getInsert b from to
471 | from >= to = []
472 | otherwise = (b!(from+1)):(getInsert b (from+1) to)
473 getDelete :: PArray -> Int -> Int -> [PackedString]
474 getDelete a from to
475 | from >= to = []
476 | otherwise = (a!(from+1)):(getDelete a (from+1) to)
478 createPatch :: BArray -> BArray -> [(Int, Int, Int, Int)]
479 createPatch c_a c_b =
480 reverse $ createP c_a c_b (aLen c_a) (aLen c_b)
482 createP :: BArray -> BArray -> Int -> Int -> [(Int, Int, Int, Int)]
483 createP _ _ 0 0 = []
484 createP c_a c_b ia ib =
485 if c_a!ia || c_b!ib
486 then let ia' = skipChangedRev c_a ia
487 ib' = skipChangedRev c_b ib
488 in (ia',ia,ib',ib):(createP c_a c_b ia' ib')
489 else createP c_a c_b (ia-1) (ib-1)
491 skipChangedRev :: BArray -> Int -> Int
492 skipChangedRev c i = if i >= 0 && c!i then skipChangedRev c (i-1) else i
493 \end{code}