package TextIndex; $VERSION = '2.0'; use Exporter; use strict; our @ISA = qw(Exporter); our @EXPORT=qw(InitTextIndex IndexText SearchText); use SafeIO; # ------------------------------------ my $dbdir = 'textindexdb'; my $uid; my $mappings = { }; my $treelevel = 3; sub InitTextIndex { my($dir) = @_; defined($dir) and $dbdir = $dir; (-d $dbdir) or mkdir $dbdir; } sub NewUID { ObtainLock($dbdir . '/id'); my $id = ReadFile( $dbdir.'/uid', { 'uid' => 0 } ); $id->{'uid'} = $id->{'uid'} + 1; WriteFile( $dbdir.'/uid', $id ); ReleaseLock($dbdir . '/id'); return '_' . $id->{'uid'}; } sub GetMappingsKeyLocation { my($key) = @_; if(length($key)<4) { $key = $key . ('X' x (4 - length($key)) ); } $key =~ s/^_+//g; my $t = Transform($key); (defined($t) and ($#$t > -1)) or return($dbdir); $t = $t->[0]; if(length($t)<$treelevel) { $t = ('_' x ($treelevel - length($t)) ) . $t; } return doTraverse($t, 1); } sub EnsureMappings { my($keylocation, $force) = @_; if($force || (!defined($mappings->{$keylocation})) ) { $mappings->{$keylocation} = ReadFile( $keylocation . '/-mappings', { 'byuserkey' => { }, 'byuid' => { } } ); } } sub KeyToUID { my ($key, $ensure) = @_; my $uid = $key; $uid =~ s/([^a-zA-Z0-9_])/sprintf("-%02x.", ord($1))/ge; $uid = '_' . $uid; return $uid; } sub UIDToKey { my ($uid) = @_; my $key = $uid; $key =~ s/^_//; $key =~ s/-([0-9a-fA-F][0-9a-fA-F])./chr(hex($1))/ge; return $key; } # ------------------------------------ sub doTraverse { my($key, $ensure) = @_; defined($key) or return undef; $key =~ /^(.{0,$treelevel})(.*?)$/ or return undef; my @dir = (split('', $1), $2); my $c = $dbdir; foreach my $part(@dir) { $c = $c . '/' . $part; (-d $c) or ($ensure and mkdir $c) or return undef; } return $c; } sub WriteEntry { my($key, $data) = @_; my $target = doTraverse($key, 1); defined($target) or return undef; $target = $target . '/data.entry'; WriteFile($target, $data); return 1; } sub ReadEntry { my($key, $default) = @_; my $target = doTraverse($key); defined($target) or return $default; $target = $target . '/data.entry'; return ReadFile($target, $default); } sub DeleteEntry { my($key) = @_; my $target = doTraverse($key); defined($target) or return 0; $target = $target . '/data.entry'; -f $target or return 0; return unlink $target; } sub Transform { my($text) = @_; $text =~ s/[^A-Za-z0-9]+/ /g; my @keys = sort(split(' ', uc $text)); my $result = [ ]; my $first = ''; while($#keys > -1) { my $first = shift @keys; (length($first)>2) and (length($first)<16) and push @$result, $first; while(($#keys > -1) && ($first eq $keys[0])) { shift @keys; } } return $result; } # ------------------------------------ sub Exists { my ($set, $item) = @_; my $low = 0; my $high = $#$set; while($high > $low) { my $mid = int(($high + $low) / 2); my $r = $set->[$mid] cmp $item; if ($r < 0) { $low = $mid + 1; } elsif ($r > 0) { $high = $mid - 1; } else { return 1; } } return (($high > -1) && ($set->[$high] eq $item)); } sub Intersect { my ($a, $b) = @_; my $aonly = [ ]; my $bonly = [ ]; my $both = [ ]; my $merged = [ ]; while(($#$a > -1) && ($#$b > -1)) { my $r = $a->[0] cmp $b->[0]; if($r < 0) { my $item = shift @$a; push(@$aonly, $item); push(@$merged, $item); } elsif($r > 0) { my $item = shift @$b; push(@$bonly, $item); push(@$merged, $item); } else { my $item = shift @$a; push(@$both, $item); push(@$merged, $item); shift @$b; } } if($#$a > -1) { push @$aonly, @$a; push @$merged, @$a; } if($#$b > -1) { push @$bonly, @$b; push @$merged, @$b; } return ($aonly, $bonly, $both, $merged); } # ------------------------------------ sub IndexTerm { my($term, $uid) = @_; ObtainLock($dbdir . '/__' . $term); my $entry = ReadEntry($term, [ ]); if(!Exists($entry, $uid)) { my @r = Intersect($entry, [$uid]); WriteEntry($term, $r[3]); } ReleaseLock($dbdir . '/__' . $term); } sub UnindexTerm { my($term, $uid) = @_; ObtainLock($dbdir . '/__' . $term); my $entry = ReadEntry($term, [ ]); if(Exists($entry, $uid)) { my @r = Intersect($entry, [$uid]); WriteEntry($term, $r[0]); } ReleaseLock($dbdir . '/__' . $term); } sub IndexText { my($text, $userkey) = @_; my $uid = KeyToUID($userkey, 1); my $entry = { 'terms' => [] }; $entry = ReadEntry($uid, $entry); my $oldterms = $entry->{'terms'}; my $newterms = Transform($text); { my @diffs = Intersect($oldterms, Transform($text)); { foreach my $term(@{$diffs[0]}) { UnindexTerm($term, $uid); } } { foreach my $term(@{$diffs[1]}) { IndexTerm($term, $uid); } } } $entry->{'userkey'} = $userkey; $entry->{'terms'} = $newterms; WriteEntry($uid, $entry); return $uid; } sub SearchText { my($query, $type, $verbose) = @_; my $terms = Transform($query); ($#$terms > -1) or return ([ ], [ ]); $verbose and print("Query terms: " . join(', ', @$terms) . "
\n"); my $any = [ ]; my $all = ReadEntry($terms->[0], [ ]); foreach my $term(@$terms) { my $uids = ReadEntry($term, [ ]); my $uids2 = [ @$uids ]; if(!defined($type) || $type eq 'all') { my @r = Intersect($uids, $all); $all = $r[2]; } if(!defined($type) || $type eq 'any') { my @r = Intersect($uids2, $any); $any = $r[3]; } } if(!defined($type) || $type eq 'any') { my $r = [ ]; foreach my $item(@$any) { defined(UIDToKey($item)) and push @$r, UIDToKey($item); } $any = $r; } if(!defined($type) || $type eq 'all') { my $r = [ ]; foreach my $item(@$all) { defined(UIDToKey($item)) and push @$r, UIDToKey($item); } $all = $r; } return($any, $all); } # ------------------------------------ 1;