package libgajaf;

# libgajaf - signature-file keyword index.
#
# Every indexed document is stored as one fixed-size record:
#   URL field ($METALENG bytes) + two bit vectors ($BVLENG bytes each).
# A document can only contain all words of a query if every bit set in the
# query's vectors is also set in the document's vectors (see sigCheck()).

use strict;

our $version  = "2.3";
our $update   = "20020126";

# Number of records rejected by sigCheck() since the library was loaded.
our $expunged = 0;

my ($HASHBITS, $METALENG, $BVLENG);

# These are read but never assigned in this file; presumably gajaf.conf
# sets them -- TODO confirm against the configuration file.
our ($dbfile, $HTMLMODE, %HASHLENG);

# In-memory copy of the database (one [url, vector0, vector1] per record)
# and the flags that track the state of the database file.
our (@bitvecDB, $is_DBfile_read, $is_DBfile_open);

require './gajaf.conf';

# Length of the signature per file, in bits.
# If you change $HASHBITS you should also adapt the hash functions so that
# they stay suitable.  (The original note names &trHASH(), which is not
# defined in this file; addHash()/modHASH() do the hashing here.)
$HASHBITS = 1024;
$BVLENG   = $HASHBITS / 8;
$METALENG = 128;
our $RECLENG = $METALENG + $BVLENG + $BVLENG;

# readDBfile()
# Loads every complete record of $dbfile into @bitvecDB and sets
# $is_DBfile_read.  A database that cannot be opened is treated as empty
# (the in-memory table is left untouched).  Always returns 1.
sub readDBfile {
    # A lexical handle is used on purpose: the package handle DBFILE may be
    # held open for appending by openDB() and must not be disturbed.
    my $fh;
    if (!open($fh, '<', $dbfile)) {
        # A database that does not exist yet is normal; anything else is
        # worth reporting.
        warn "libgajaf: cannot open '$dbfile' for reading: $!\n"
            if -e $dbfile;
        $is_DBfile_read = 1;
        return 1;
    }

    my @records;
    while (!eof($fh)) {
        my $data;
        my $got = read($fh, $data, $RECLENG);

        # Ignore a truncated trailing record instead of unpacking garbage.
        last if !defined $got || $got != $RECLENG;

        push @records, [ unpack("A$METALENG a$BVLENG a$BVLENG", $data) ];
    }
    close($fh);

    @bitvecDB       = @records;
    $is_DBfile_read = 1;
    return 1;
}

# openDB()
# Opens $dbfile for appending on the package handle DBFILE so that several
# addSignature() calls can share one handle.
# Returns 1 on success, false (after a warning) on failure.
sub openDB {
    if (!open(DBFILE, '>>', $dbfile)) {
        warn "libgajaf: cannot open '$dbfile' for appending: $!\n";
        $is_DBfile_open = 0;
        return;
    }
    $is_DBfile_open = 1;
    return 1;
}

# closeDB()
# Closes the handle opened by openDB() and clears $is_DBfile_open.
# Returns 0, the same value the original implementation returned.
sub closeDB {
    my $closed = close(DBFILE);
    warn "libgajaf: error closing '$dbfile': $!\n"
        if !$closed && $is_DBfile_open;
    $is_DBfile_open = 0;
    return 0;
}

# addSignature($url, $text)
# Computes the signature of $text and appends one record to the database
# file.  If the database has been loaded with readDBfile(), the record is
# appended to @bitvecDB as well.
# The URL field in the file is $METALENG bytes wide (pack "a" truncates or
# NUL-pads); the in-memory entry keeps $url as given.
# Returns 1 on success, false (after a warning) when the record could not
# be written.
sub addSignature {
    my ($url, $text) = @_;

    # Open the file just for this call unless openDB() already did.
    my $opened_here = 0;
    if (!$is_DBfile_open) {
        if (!open(DBFILE, '>>', $dbfile)) {
            warn "libgajaf: cannot open '$dbfile' for appending: $!\n";
            return;
        }
        $opened_here = 1;
    }

    my @HASH   = make_hash($text);
    my $record = pack("a$METALENG a$BVLENG a$BVLENG",
                      $url, $HASH[0], $HASH[1]);

    my $written = syswrite(DBFILE, $record, $RECLENG);
    my $ok      = defined $written && $written == $RECLENG;
    warn "libgajaf: short or failed write to '$dbfile': $!\n" if !$ok;

    if ($opened_here) {
        close(DBFILE)
            or warn "libgajaf: error closing '$dbfile': $!\n";
    }
    return if !$ok;

    # Keep the in-memory table in step with the file.
    if ($is_DBfile_read) {
        push @bitvecDB, [ $url, $HASH[0], $HASH[1] ];
    }
    return 1;
}

# sigCheck($keywords)
# Returns the list of URLs whose signature contains every bit of the
# signature of $keywords.  Scanning stops at the first record with an
# empty URL.  Each rejected record increments $expunged.
sub sigCheck {
    my ($keywords) = @_;
    my @KEY = make_hash($keywords);
    my @result;

    # Iterate over the existing elements only.  Probing $bitvecDB[$i][0]
    # one past the end would autovivify an empty record, which hides every
    # signature that addSignature() stores after it.
    for my $record (@bitvecDB) {
        last if !defined $record
             || !defined $record->[0]
             || $record->[0] eq "";

        if (   ($KEY[0] & $record->[1]) eq $KEY[0]
            && ($KEY[1] & $record->[2]) eq $KEY[1]) {
            push @result, $record->[0];
        }
        else {
            $expunged++;
        }
    }
    return @result;
}

# make_hash($key)
# Converts $key to EUC via jcode::convert, removes <...> tags when
# $HTMLMODE is "yes", splits the text into index words and returns the two
# signature bit vectors as a list.
sub make_hash {
    my ($key) = @_;

    # jcode::convert is defined outside this file; the call form is kept
    # exactly as in the original.
    &jcode::convert(\$key, 'euc');

    if ($HTMLMODE eq "yes") {
        $key =~ s/<.*?>//g;
    }

    my %words = make_wordlist($key);
    return make_signature(%words);
}

# Tokenizer state shared between make_wordlist() and RESOLVEWORD().
# make_wordlist() localizes these for the duration of one scan:
#   %wordlist  collected words (word => character class)
#   $char      the character just read (1 byte, or 2 bytes above 0xA0)
#   $word      the word being accumulated
#   $wordtype  character class of $word
our (%wordlist, $char, $word, $wordtype);

# make_wordlist($text)
# Scans $text byte by byte and classifies each character: ASCII
# alphanumerics ("ascii"), other bytes up to 0xA0 ("parse"), and two-byte
# sequences by their first byte: >= 0xB0 "kanji", 0xA4 "kana",
# 0xA5 "KANA", selected 0xA1 symbols "EUCsup", everything else "other".
# Returns the collected words as a flat word => class list.
sub make_wordlist {
    my ($text) = @_;
    my $leng = length($text);
    my $i    = 0;

    local %wordlist;
    local ($char, $word);
    local $wordtype = "parse";

    while ($i < $leng) {
        $char = substr($text, $i++, 1);

        if ($char =~ /[0-9a-zA-Z]/) {       # ASCII alphanumeric character
            RESOLVEWORD("ascii");
        }
        elsif ($char le "\xA0") {           # ASCII word-separating character
            RESOLVEWORD("parse");
        }
        else {
            # Anything above 0xA0 is taken as the first byte of a
            # two-byte character.
            my $headbyte = $char;
            my $footbyte = substr($text, $i++, 1);
            $char = $headbyte . $footbyte;

            if ($headbyte ge "\xB0") {      # EUC kanji character
                RESOLVEWORD("kanji");
            }
            elsif ($headbyte eq "\xA4") {   # EUC hiragana character
                RESOLVEWORD("kana");
            }
            elsif ($headbyte eq "\xA5") {   # EUC katakana character
                RESOLVEWORD("KANA");
            }
            elsif (   $headbyte eq "\xA1"
                   && $footbyte ge "\xB3"
                   && ($footbyte le "\xBA" || $footbyte eq "\xBC")) {
                # EUC supportive character: second byte 0xB3..0xBA or 0xBC
                RESOLVEWORD("EUCsup");
            }
            else {                          # any other EUC character
                RESOLVEWORD("other");
            }
        }
    }

    RESOLVEWORD("parse");                   # flush the last pending word
    return %wordlist;
}

# RESOLVEWORD($type)
# Feeds the current character $char, of class $type, into the tokenizer
# state and records finished words in %wordlist.  Operates on the state
# variables localized by make_wordlist().
sub RESOLVEWORD {
    my ($type) = @_;

    # Every single kanji is an index word of its own.
    if ($type eq 'kanji') {
        $wordlist{$char} = $type;
    }

    # A supportive character continues a running kanji/kana/KANA word.
    if ($type =~ /EUCsup/ && $wordtype =~ /(?:kanji|kana|KANA)/) {
        $type = $wordtype;
    }

    if ($type ne $wordtype) {
        # The character class changed: store a finished ASCII word along
        # with the word minus its last and minus its first character, then
        # start a new word with the current character.
        if ($wordtype eq "ascii" && length($word) >= $HASHLENG{$wordtype}) {
            my $leng = length($word);
            $wordlist{$word}                       = $wordtype;
            $wordlist{ substr($word, 0, $leng - 1) } = $wordtype;
            $wordlist{ substr($word, 1, $leng - 1) } = $wordtype;
        }
        $word     = $char;
        $wordtype = $type;
        return;
    }

    # Same class as before: extend the current word.
    $word = $word . $char;

    if ($type =~ /kanji/) {
        if (length($word) >= 4) {           # 4 = 2 characters x 2 bytes
            $wordlist{$word} = $wordtype;
            $word = substr($word, 2);       # slide on by one character
        }
    }

    if ($type eq 'kana' || $type eq 'KANA') {
        if (length($word) >= $HASHLENG{$wordtype} * 2) {
            $wordlist{$word} = $wordtype;
            $word = substr($word, 2);       # slide on by one character
        }
    }

    if ($type eq 'parse' || $type eq 'other') {
        # $type equals $wordtype at this point, so the "ascii" test below
        # cannot succeed; it is kept as written, with the former call to
        # the undefined leng() corrected to length().
        if ($wordtype eq "ascii" && length($word) >= $HASHLENG{$wordtype}) {
            $wordlist{$word} = $wordtype;
        }
        $word     = "";
        $wordtype = $type;
    }
    return;
}

# make_signature(%words)
# Builds the two $HASHBITS-bit vectors for the keys of %words (the values
# are not used) and returns them as a two-element list.
sub make_signature {
    my (%words) = @_;
    my @HASH;

    # Writing the highest bit sizes each vector to $BVLENG zero bytes.
    vec($HASH[0], $HASHBITS - 1, 1) = 0;
    vec($HASH[1], $HASHBITS - 1, 1) = 0;

    for my $entry (keys %words) {
        addHash(\@HASH, $entry) if $entry ne "";
    }
    return @HASH;
}

# addHash(\@vectors, $word)
# Sets the bit selected for $word in each of the two vectors: vector 0
# uses modulus 9973, vector 1 uses modulus 10007.  Returns 1.
sub addHash {
    my ($vectors, $entry) = @_;

    my $bit0 = modHASH($entry, 9973)  % $HASHBITS;
    my $bit1 = modHASH($entry, 10007) % $HASHBITS;

    vec($vectors->[0], $bit0, 1) = 1;
    vec($vectors->[1], $bit1, 1) = 1;
    return 1;
}

# modHASH($word, $prime)
# Reads the bytes of $word as one base-256 number and returns it modulo
# $prime.
# NOTE(review): for longer words the accumulator may exceed the range in
# which numbers are exact, so the result then depends on floating-point
# rounding.  The arithmetic is left untouched because signatures already
# stored in a database depend on these exact values.
sub modHASH {
    my ($entry, $prime) = @_;

    my $acc = 0;
    for my $byte (unpack("C*", $entry)) {
        $acc = $acc * 256 + $byte;
    }

    # can't use % because of overflow.
    return $acc - int($acc / $prime) * $prime;
}

1;