/** * An as3 port of the Bitap algorithm for fuzzy string matching * from the javascript version of google-diff-match-patch * @author Hristo Dachev http://blog.controul.com/ * * See: * * Neil Fraser http://neil.fraser.name/ * http://code.google.com/p/google-diff-match-patch/ * Original license follows. */ /** * Diff Match and Patch * * Copyright 2006 Google Inc. * http://code.google.com/p/google-diff-match-patch/ * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.controul.utils.string { public class Bitap { /** * The maximum valid pattern length. */ public const maxPatternLength : int = 32; /** * Tweak the relative importance (0.0 = accuracy, 1.0 = proximity) */ public var matchBalance : Number = 0.5; /** * At what point is no match declared (0.0 = perfection, 1.0 = very loose) */ public var matchThreshold : Number = 0.5; /** * The min cutoff used when computing text lengths. */ public var matchMinLength : int = 100; /** * The max cutoff used when computing text lengths. */ public var matchMaxLength : int = 1000; /** * Locate the best instance of 'pattern' in 'text' near 'loc' using the * Bitap algorithm. Returns -1 if no match found. * @param text The text to search. * @param pattern The pattern to search for. * @param loc The location to search around. * @return Best match index or -1. */ public function match ( text : String, pattern : String, loc : int ) : int { var patternLength : int = pattern.length, textLength : int = text.length; if ( patternLength > maxPatternLength ) throw new Error ( this + ' @ Bitap.match () : "' + pattern + '" is longer than 32 characters.' ); // Highest score beyond which we give up. var scoreThreshold : Number = matchThreshold; // Is there a nearby exact match? (speedup) var best : int = text.indexOf ( pattern, loc ); if ( best != -1 ) { // Exact match, exact location. if ( best == loc ) return loc; scoreThreshold = Math.min ( score ( 0, best, loc, scoreTextLength, patternLength ), scoreThreshold ); } // What about in the other direction? (speedup) best = text.lastIndexOf ( pattern, loc ); if ( best != -1 ) { scoreThreshold = Math.min ( score ( 0, best, loc, scoreTextLength, patternLength ), scoreThreshold ); } // Initialise the alphabet. var s : Object = {}, char : String, v : uint, i : int, l : int = patternLength - 1; for ( i = 0; i < patternLength; i ++ ) { char = pattern.charAt ( i ); v = s [ char ]; v |= 1 << ( l - i ); s [ char ] = v; } // Coerce the text length between reasonable maximums and minimums. var scoreTextLength : int = textLength; if ( scoreTextLength > matchMaxLength ) scoreTextLength = matchMaxLength; else if ( scoreTextLength < matchMinLength ) scoreTextLength = matchMinLength; // Initialise the bit arrays. best = -1; var matchmask : int = 1 << ( patternLength - 1 ), binMin : int, binMid : int, binMax : int = Math.max ( loc + loc, textLength ); // Hash character values. var schar : uint, rd : Vector. = bufferA, last_rd : Vector. = bufferB, temp_rd : Vector.; if ( !rd ) { bufferA = rd = new Vector.; bufferB = last_rd = new Vector.; } // Scan for the best match; each iteration allows for one more error. var d : int; for ( d = 0; d < patternLength; d++ ) { // Run a binary search to determine how far from 'loc' we can stray at this error level. binMin = loc; binMid = binMax; while ( binMin < binMid ) { if ( score ( d, binMid, loc, scoreTextLength, patternLength ) < scoreThreshold ) binMin = binMid; else binMax = binMid; binMid = ( binMax + binMin ) / 2; } // Use the result from this iteration as the maximum for the next. binMax = binMid; var start : int = loc - ( binMid - loc ) - 1, finish : int = textLength - 1; if ( start < 0 ) start = 0; if ( finish > patternLength + binMid ) finish = patternLength + binMid; rd.length = finish; if ( text.charAt ( finish ) == pattern.charAt ( patternLength - 1 ) ) rd [ finish ] = ( 1 << ( d + 1 ) ) - 1; else rd [ finish ] = ( 1 << d ) - 1; var j : int; for ( j = finish - 1; j >= start; j-- ) { schar = s [ text.charAt ( j ) ]; // First pass: exact match. if ( d == 0 ) rd [ j ] = ( ( rd [ int ( j + 1 ) ] << 1 ) | 1 ) & schar; // Subsequent passes: fuzzy match. else rd [ j ] = ( ( rd [ int ( j + 1 ) ] << 1 ) | 1 ) & schar | ( ( last_rd [ int ( j + 1 ) ] << 1 ) | 1 ) | ( ( last_rd [ j ] << 1 ) | 1 ) | last_rd [ int ( j + 1 ) ]; if ( ( rd [ j ] & matchmask ) != 0 ) { // This match will almost certainly be better than any existing // match. But check anyway. var scr : Number = score ( d, j, loc, scoreTextLength, patternLength ); if ( scr <= scoreThreshold ) { // Told you so. scoreThreshold = scr; best = j; if ( j > loc ) { // When passing loc, don't exceed our current distance from loc. start = loc - ( j - loc ); if ( start < 0 ) start = 0; } else // Already passed loc, downhill from here on in. break; } } } if ( score ( d + 1, loc, loc, scoreTextLength, patternLength ) > scoreThreshold ) // No hope for a (better) match at greater error levels. break; temp_rd = last_rd; last_rd = rd; rd = temp_rd; } // Clean-up. rd.length = 0; last_rd.length = 0; return best; } /** * Compute and return the score for a match with e errors and x location. * @param e Number of errors in match. * @param x Location of match. * @param loc Expected location of match. * @param scoreTextLength Coerced version of text's length. * @param pattern Pattern being sought. * @return Overall score for match. */ private function score ( e : Number, x : Number, loc : Number, scoreTextLength : Number, patternLength : Number ) : Number { return ( e / patternLength / matchBalance ) + ( ( loc > x ? loc - x : x - loc ) / scoreTextLength / ( 1.0 - matchBalance ) ); } // Reusable objects. private static var bufferA : Vector.; private static var bufferB : Vector.; } }