首页  编辑  

比较字符串的近似程度

Tags: /超级猛料/String.字符串处理/   Date Created:

比较字符串的近似程度

{

Compares two strings in percent (how they are similar to each other)

Returns byte value from 0 to 100%

examples:

var

 Percent: byte;

begin

 Percent := CompareStringsInPercent('this is a test', 'This is another test'); // 37%

 Percent := CompareStringsInPercent('this is some string', 'and yet another some string'); // 24%

 Percent := CompareStringsInPercent('abcde', 'fghij'); // 0%

 Percent := CompareStringsInPercent('1.jpg', '2.jpg'); // 81%

...

}

function CompareStringsInPercent(Str1, Str2: string): Byte;

type

 TLink = array[0..1] of Byte;

var

 tmpPattern: TLink;

 PatternA, PatternB: array of TLink;

 IndexA, IndexB, LengthStr: Integer;

begin

 Result := 100;

 // Building pattern tables

 LengthStr := Max(Length(Str1), Length(Str2));

 for IndexA := 1 to LengthStr do

 begin

   if Length(Str1) >= IndexA then

   begin

     SetLength(PatternA, (Length(PatternA) + 1));

     PatternA[Length(PatternA) - 1][0] := Byte(Str1[IndexA]);

     PatternA[Length(PatternA) - 1][1] := IndexA;

   end;

   if Length(Str2) >= IndexA then

   begin

     SetLength(PatternB, (Length(PatternB) + 1));

     PatternB[Length(PatternB) - 1][0] := Byte(Str2[IndexA]);

     PatternB[Length(PatternB) - 1][1] := IndexA;

   end;

 end;

 // Quick Sort of pattern tables

 IndexA := 0;

 IndexB := 0;

 while ((IndexA < (Length(PatternA) - 1)) and (IndexB < (Length(PatternB) - 1))) do

 begin

   if Length(PatternA) > IndexA then

   begin

     if PatternA[IndexA][0] < PatternA[IndexA + 1][0] then

     begin

       tmpPattern[0]           := PatternA[IndexA][0];

       tmpPattern[1]           := PatternA[IndexA][1];

       PatternA[IndexA][0]     := PatternA[IndexA + 1][0];

       PatternA[IndexA][1]     := PatternA[IndexA + 1][1];

       PatternA[IndexA + 1][0] := tmpPattern[0];

       PatternA[IndexA + 1][1] := tmpPattern[1];

       if IndexA > 0 then Dec(IndexA);

     end

     else

       Inc(IndexA);

   end;

   if Length(PatternB) > IndexB then

   begin

     if PatternB[IndexB][0] < PatternB[IndexB + 1][0] then

     begin

       tmpPattern[0]           := PatternB[IndexB][0];

       tmpPattern[1]           := PatternB[IndexB][1];

       PatternB[IndexB][0]     := PatternB[IndexB + 1][0];

       PatternB[IndexB][1]     := PatternB[IndexB + 1][1];

       PatternB[IndexB + 1][0] := tmpPattern[0];

       PatternB[IndexB + 1][1] := tmpPattern[1];

       if IndexB > 0 then Dec(IndexB);

     end

     else

       Inc(IndexB);

   end;

 end;

 // Calculating simularity percentage

 LengthStr := Min(Length(PatternA), Length(PatternB));

 for IndexA := 0 to (LengthStr - 1) do

 begin

   if PatternA[IndexA][0] = PatternB[IndexA][0] then

   begin

     if Max(PatternA[IndexA][1], PatternB[IndexA][1]) - Min(PatternA[IndexA][1],

       PatternB[IndexA][1]) > 0 then Dec(Result,

       ((100 div LengthStr) div (Max(PatternA[IndexA][1], PatternB[IndexA][1]) -

         Min(PatternA[IndexA][1], PatternB[IndexA][1]))))

     else if Result < 100 then Inc(Result);

   end

   else

     Dec(Result, (100 div LengthStr))

 end;

 SetLength(PatternA, 0);

 SetLength(PatternB, 0);

end;

---------------------------------------

function Max(i1,i2:Integer):Integer;

begin

 if i1>=i2 then

   Result:=i1

 else

   Result:=i2;

end;

function StrSimilar (s1, s2: string; tolerant: Boolean): Integer;

var hit: Integer; // Number of identical chars

   p1, p2: Integer; // Position count

   l1, l2, l: Integer; // Length of strings

   diff: Integer; // unsharp factor

   hstr: string; // help var for swapping strings

   // Array shows if position is already tested

   test: Classes.TBits;

function CompChar (ch1, ch2: Char): Boolean;

// german "umlauts" and similar charactes

begin

 if tolerant then begin

  ch1:= UpCase (ch1); // compare case insensitive

  ch2:= UpCase (ch2);

  case ch1 of

   'A', 'E': Result:= ch2 in ['A', 'E'];

   'B', 'P': Result:= ch2 in ['B', 'P'];

   'C', 'Z': Result:= ch2 in ['C', 'Z'];

   'D', 'T': Result:= ch2 in ['D', 'T'];

   'F', 'V': Result:= ch2 in ['F', 'V'];

   'G', 'K': Result:= ch2 in ['G', 'K'];

   'S': Result:= ch2 in ['S'];

   'I', 'J',

   'Y': Result:= ch2 in ['I', 'J', 'Y'];

   else Result:= ch1 = ch2;

  end;

 end else begin

  Result:= ch1 = ch2;

 end;

end;

begin

l1:= Length (s1);

l2:= Length (s2);

if (l1 <= 0) or (l2 <= 0) then begin Result:= 0; Exit; end;

// Test Length and swap, if s1 is smaller

if l1 < l2 then begin

 hstr:= s2;  s2:= s1;  s1:= hstr;

 l:= l2; l2:= l1; l1:= l;

end;

p1:= 1;  p2:= 1;  hit:= 0;

// calc the unsharp factor depending on

// the length of the strings

diff:= Max (l1, l2) div 3 + ABS (l1 - l2);

// init the test array

test:= Classes.TBits.Create;

// Calc size of TBits. Must be two bigger, because we're 0-Based

// counting from 1, and we need one more then stringlength

test.Size:= l1 + 2;

// loop through the string

repeat

 // position tested?

 if not test.Bits[p1] then begin

  // found a matching character?

  if CompChar (s1[p1], s2[p2]) and

              (ABS(p1-p2) <= diff) then begin

   test.Bits[p1]:= True;

   Inc (hit); // increment the hit count

   // next positions

   Inc (p1); Inc (p2);

   if p1 > l1 then p1:= 1;

  end else begin

   // Set test array

   test.Bits[p1]:= False;

   Inc (p1);

   // Loop back to next test position

   if p1 > l1 then begin

    while (p1 > 1) and not (test[p1]) do Dec (p1);

    Inc (p2)

   end;

  end;

 end else begin

  Inc (p1);

  // Loop back to next test position

  if p1 > l1 then begin

   repeat Dec (p1); until (p1 = 1) or test.Bits[p1];

   Inc (p2);

  end;

 end;

until p2 > l2;

test.Free; // Release Booleanlist

// calc procentual value

Result:= 100 * hit DIV l1;

end;

procedure TForm1.Button1Click(Sender: TObject);

begin

 Caption:=IntToStr(StrSimilar(Edit1.Text,Edit2.Text,true));

end;

eg:

 Edit1.Text is 福建省厦门市中山路11号

 if Edit2.Text is 福建厦门中山路11号     -> 81

 if Edit2.Text is 福建厦门中山路11       -> 72

 if Edit2.Text is 福建厦门中山路         -> 63

 if Edit2.Text is 厦门市福建省中山路11号 -> 59

原算法的作者为 Peter Hellinger  24. May 2000

---------------------------------------

'John' and 'John' = 100%

'John' and 'Jon' = 75%

'Jim' and 'James' = 40%

"Luke Skywalker" and 'Darth Vader' = 0%

function StrSimilar (s1, s2: string): Integer;

var hit: Integer; // Number of identical chars

   p1, p2: Integer; // Position count

   l1, l2: Integer; // Length of strings

   pt: Integer; // for counter

   diff: Integer; // unsharp factor

   hstr: string; // help var for swapping strings

   // Array shows is position is already tested

   test: array [1..255] of Boolean;

begin

// Test Length and swap, if s1 is smaller

// we alway search along the longer string

if Length(s1) < Length(s2) then begin

 hstr:= s2;  s2:= s1;  s1:= hstr;

end;

// store length of strings to speed up the function

l1:= Length (s1);

l2:= Length (s2);

p1:= 1;  p2:= 1;  hit:= 0;

// calc the unsharp factor depending on the length

// of the strings.  Its about a third of the length

diff:= Max (l1, l2) div 3 + ABS (l1 - l2);

// init the test array

for pt:= 1 to l1 do test[pt]:= False;

// loop through the string

repeat

 // position tested?

 if not test[p1] then begin

  // found a matching character?

  if (s1[p1] = s2[p2]) and (ABS(p1-p2) <= diff) then begin

   test[p1]:= True;

   Inc (hit); // increment the hit count

   // next positions

   Inc (p1); Inc (p2);

   if p1 > l1 then p1:= 1;

  end else begin

   // Set test array

   test[p1]:= False;

   Inc (p1);

   // Loop back to next test position if end of the string

   if p1 > l1 then begin

    while (p1 > 1) and not (test[p1]) do Dec (p1);

    Inc (p2)

   end;

  end;

 end else begin

  Inc (p1);

  // Loop back to next test position if end of string

  if p1 > l1 then begin

   repeat Dec (p1); until (p1 = 1) or test[p1];

   Inc (p2);

  end;

 end;

until p2 > Length(s2);

// calc procentual value

Result:= 100 * hit DIV l1;

end;