%  Copyright (C) 2002 David Roundy
%
%  This program is free software; you can redistribute it and/or modify
%  it under the terms of the GNU General Public License as published by
%  the Free Software Foundation; either version 2, or (at your option)
%  any later version.
%
%  This program is distributed in the hope that it will be useful,
%  but WITHOUT ANY WARRANTY; without even the implied warranty of
%  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
%  GNU General Public License for more details.
%
%  You should have received a copy of the GNU General Public License
%  along with this program; if not, write to the Free Software Foundation,
%  Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
\chapter{LCS}

\section{Introduction}

``LCS'' stands for ``Longest Common Subsequence,'' and it is a relatively
challenging problem to find an LCS efficiently.  I'm not going to explain
here what an LCS is, but will point out that it is useful in finding how
two sequences (lists, in this case) differ.  This module implements the
Hunt-Szymanski algorithm, which is appropriate for applications in which
the sequence is on an infinite alphabet, such as diffing the lines in two
files, where many, or most lines are unique.  In the best case scenario, a
permutation of unique lines, this algorithm is $O(n\log n)$.  In the worst
case scenario, that of a finite alphabet (i.e. where the number of elements
in the sequence is much greater than the number of unique elements), it is
an $O(n^2\log n)$ algorithm, which is pretty terrible.

You should probably be aware that most diff programs do \emph{not} find an
lcs.  Instead they use a faster algorithm, which doesn't give an optimal
diff, but does give one quickly.

\begin{code}
module Lcs ( lcs, subtract_subsequence ) where

import List ( sort )
import MArray
import FastPackedString
import ST

lcs :: Ord a => [a] -> [a] -> [a]
{-# SPECIALIZE lcs ::[String] -> [String] -> [String] #-}
{-# SPECIALIZE lcs ::[PackedString] -> [PackedString] -> [PackedString] #-}
\end{code}

In order to make use of our lcs, we will want to be able to remove it from
the strings (to see what has changed).  That is what subtract\_substring
does for us.

\begin{code}
subtract_subsequence :: Eq a => [a] -> [a] -> [a]
subtract_subsequence s [] = s
subtract_subsequence [] _ = []
subtract_subsequence (c:cs) (s:ss)
    | c == s = subtract_subsequence cs ss
    | otherwise = c : subtract_subsequence cs (s:ss)
\end{code}

\begin{code}
lcs [] _ = []
lcs _ [] = []
lcs (c1:c1s) (c2:c2s)
    | c1 == c2 = c1: lcs c1s c2s
    | otherwise =
        reverse $ lcs_simple (reverse (c1:c1s)) (reverse (c2:c2s))

lcs_simple :: Ord a => [a] -> [a] -> [a]
lcs_simple [] _ = []
lcs_simple _ [] = []
lcs_simple (c1:c1s) (c2:c2s)
    | c1 == c2 = c1: lcs c1s c2s
    | otherwise =
        case unzip $ prune_matches s1 $! find_matches s1 s2 of
        (s1',m1') -> hunt s1' m1'
        where s1 = (c1:c1s)
              s2 = (c2:c2s)

prune_matches _ [] = []
prune_matches [] _ = []
prune_matches (c:cs) ([]:ms) = prune_matches cs ms
prune_matches (c:cs) (m:ms) = (c,m): prune_matches cs ms

type Threshold s a = STArray s Int (Int,[a])

hunt :: [a] -> [[Int]] -> [a]
hunt [] _ = []
hunt cs matches =
    runST (do
           th <- empty_threshold (length cs) l
           hunt_internal cs matches th
           hunt_recover th (-1) l
          )
    where m = matches
          l = foldl (\x y->max x (foldl max 0 y)) 0 m

hunt_internal :: [a] -> [[Int]] -> Threshold s a ->
                 ST s ()
hunt_internal [] _ th = return ()
hunt_internal _ [] th = return ()
hunt_internal (c:cs) (m:ms) th = do
    hunt_one_char c m th
    hunt_internal cs ms th

hunt_one_char :: a -> [Int] ->  Threshold s a ->
                 ST s ()
hunt_one_char c [] th = return ()
hunt_one_char c (j:js) th = do
    index_k <- my_bs j th
    case index_k of
      Nothing -> return ()
      Just k -> do
        (_, rest) <- readArray th (k-1)
        writeArray th k (j, c:rest)
    hunt_one_char c js th

-- This is O(n), which is stupid.
hunt_recover :: Threshold s a -> Int -> Int -> ST s [a]
hunt_recover th n limit
    | n < 0 = hunt_recover th max limit
    | n == 0 = return []
    | n > max = return []
    | otherwise = do
        (thn, sn) <- readArray th n
        if thn <= limit then return $ reverse sn
                        else hunt_recover th (n-1) limit
    where (min, max) = bounds th

empty_threshold :: Int -> Int -> ST s (Threshold s a)
empty_threshold l max = do
  th <- newArray (0,l) (max+1, [])
  writeArray th 0 (0, [])
  return th

my_bs :: Int -> Threshold s a -> ST s (Maybe Int)
my_bs j th = my_helper_bs j (min,max) th
             where (min,max) = bounds th

my_helper_bs :: Int -> (Int,Int) -> Threshold s a ->
                ST s (Maybe Int)
my_helper_bs j (min,max) th =
    if max - min > 1 then do
       (midth, _) <- readArray th middle
       if j > midth
         then my_helper_bs j (middle,max) th
         else my_helper_bs j (min,middle) th
    else do
       (minth, _) <- readArray th min
       (maxth, _) <- readArray th max
       if minth < j && maxth > j
          then return $ Just max
          else if j < minth then return $ Just min
               else return Nothing
    where middle = min + (max-min)`div`2

\end{code}


\begin{code}
find_matches :: Ord a => [a] -> [a] -> [[Int]]
find_matches [] [] = []
find_matches [] (b:bs) = []: find_matches [] bs
find_matches _ [] = []
find_matches a b =
    unzip_indexed $ sort $ find_sorted_matches indexeda indexedb [] []
    where indexeda = sort $ zip a [1..]
          indexedb = sort $ zip b [1..]

unzip_indexed :: [(Int,[a])] -> [[a]]
unzip_indexed s = unzip_indexed_helper 1 s
unzip_indexed_helper _ [] = []
unzip_indexed_helper thisl ((l,c):rest)
    | thisl == l = c: unzip_indexed_helper (l+1) rest
    | otherwise = []: unzip_indexed_helper (thisl+1) ((l,c):rest)

find_sorted_matches [] _ _ _ = []
find_sorted_matches _ [] _ _ = []
find_sorted_matches ((a,na):as) ((b,nb):bs) aold aoldmatches
    | [a] == aold = (na, aoldmatches) :
                  find_sorted_matches as ((b,nb):bs) aold aoldmatches
    | a > b = find_sorted_matches ((a,na):as) bs aold aoldmatches
    | a < b = find_sorted_matches as ((b,nb):bs) aold aoldmatches
-- following line is inefficient if a line is repeated many times.
    | a == b =
        case reverse $ find_matches_one a ((b,nb):bs) of
        matches -> (na, matches) :
                   find_sorted_matches as ((b,nb):bs) [a] matches

find_matches_one _ [] = []
find_matches_one a ((b,nb):bs)
    | a < b = []
    | a == b = nb: find_matches_one a bs
    | a > b = []
\end{code}
