Am I an idiot or what! There were a few stupid errors in that diffing algorithm I threw up (give me a break: they don’t have python installed here in college). Here’s a corrected version. Again, any comments are welcome.
def GetDiff(oldLines, newLines):
"""Constructs a diff to get from newlines to oldlines."""
# Construct our special "hash" table. Using primes for the table length
# would be better, I know, but odd numbers are simpler.
tblSize = len(newLines) * 2 + 1
tbl = tuple([] * tblSize)
for iNew in range(len(newLines)):
iBucket = hash(newLines[iNew]) % tblSize
tbl[iBucket].append(iNew)
# Holds our diff information.
result = []
# Find the common subsequences and differences.
iOld = 0
iMax = 0
while iOld < len(oldLines):
iBucket = hash(oldLines[iOld]) % tblSize
# Start with no match
nMax = 0
for iNew in tbl[iBucket]:
# if nMax > 0 and iNew < iMax + nMax:
# continue
nCur = 0
try:
while newLines[iNew + nCur] == oldLines[iOld + nCur]:
nCur += 1
if nCur > nMax:
nMax = nCur
iMax = iNew
iEnd = iMax + nMax
if iEnd == len(oldLines) or iEnd == len(newLines):
break
# Have we found a matching subsequence?
if nMax == 0:
# Nope.
result.append(oldLines[iOld])
iOld += 1
else:
# Woohoo!
result.append((iMax, nMax))
iOld += nMax
return result
There old version is in my last entry.
