#!/usr/bin/perl use strict; use Search; local $| = 1; # Do not buffer output (localized for mod_perl) # Configuration/constant variables: use vars qw(@RcDays @HtmlPairs @HtmlSingle $TempDir $LockDir $DataDir $HtmlDir $UserDir $KeepDir $PageDir $InterFile $RcFile $RcOldFile $IndexFile $FullUrl $SiteName $HomePage $LogoUrl $RcDefault $IndentLimit $RecentTop $EditAllowed $UseDiff $UseSubpage $UseCache $RawHtml $SimpleLinks $NonEnglish $LogoLeft $KeepDays $HtmlTags $HtmlLinks $UseDiffLog $KeepMajor $KeepAuthor $FreeUpper $EmailNotify $SendMail $EmailFrom $FastGlob $EmbedWiki $ScriptTZ $BracketText $UseAmPm $UseConfig $UseIndex $UseLookup $RedirType $AdminPass $EditPass $UseHeadings $NetworkFile $BracketWiki $FreeLinks $WikiLinks $AdminDelete $FreeLinkPattern $RCName $RunCGI $ShowEdits $ThinLine $LinkPattern $InterLinkPattern $InterSitePattern $UrlProtocols $UrlPattern $ImageExtensions $RFCPattern $ISBNPattern $PostcodePattern $LatLongPattern $RevisionPattern $PFLinkPattern $FS $FS1 $FS2 $FS3 $CookieName $SiteBase $StyleSheet $NotFoundPg $FooterNote $EditNote $MaxPost $NewText $NotifyDefault $HttpCharset $UserGotoBar $UseBuiltinDiff); # Note: $NotifyDefault is kept because it was a config variable in 0.90 # Other global variables: use vars qw(%Page %Section %Text %InterSite %SaveUrl %SaveNumUrl %KeptRevisions %UserCookie %SetCookie %UserData %IndexHash %Translate %LinkIndex $InterSiteInit $SaveUrlIndex $SaveNumUrlIndex $MainPage $OpenPageName @KeptList @IndexList $IndexInit $q $Now $UserID $TimeZoneOffset $ScriptName $BrowseCode $OtherCode); # == Configuration ===================================================== $DataDir = "data"; # Main wiki directory $UseConfig = 0; # 1 = use config file, 0 = do not look for config # Default configuration (used if UseConfig is 0) $CookieName = "ToothyWiki"; # Name for this wiki (for multi-wiki sites) $SiteName = "ToothyWiki"; # Name of site (used for titles) $HomePage = "ToothyWiki"; # Home page (change space to _) $RCName = "RecentChanges"; # Name of changes page (change space to _) $LogoUrl = "/img/text_logo.jpg"; # URL for site logo ("" for no logo) $ENV{PATH} = "/usr/bin/"; # Path used to find "diff" $ScriptTZ = ""; # Local time zone ("" means do not print) $RcDefault = 7; # Default number of RecentChanges days @RcDays = qw(1 3 7 30 90); # Days for links on RecentChanges $KeepDays = 365; # Days to keep old revisions $SiteBase = ""; # Full URL for header $FullUrl = ""; # Set if the auto-detected URL is wrong $RedirType = 1; # 1 = CGI.pm, 2 = script, 3 = no redirect $EditPass = ""; # Like AdminPass, but for editing only $StyleSheet = ""; # URL for CSS stylesheet (like "/wiki.css") $NotFoundPg = ""; # Page for not-found links ("" for blank pg) $EmailFrom = "ToothyWiki"; # Text for "From: " field of email notes. $SendMail = "/usr/sbin/sendmail"; # Full path to sendmail executable $FooterNote = ""; # HTML for bottom of every page $EditNote = ""; # HTML notice above buttons on edit page $MaxPost = 1024 * 210; # Maximum 210K posts (about 200K for pages) $NewText = ""; # New page text ("" for default message) $HttpCharset = ""; # Charset for pages, like "iso-8859-2" $UserGotoBar = ""; # HTML added to end of goto bar # Major options: $UseSubpage = 1; # 1 = use subpages, 0 = do not use subpages $UseCache = 0; # 1 = cache HTML pages, 0 = generate every page $EditAllowed = 1; # 1 = editing allowed, 0 = read-only $RawHtml = 0; # 1 = allow tag, 0 = no raw HTML in pages $HtmlTags = 0; # 1 = "unsafe" HTML tags, 0 = only minimal tags $UseDiff = 1; # 1 = use diff features, 0 = do not use diff $FreeLinks = 1; # 1 = use [[word]] links, 0 = LinkPattern only $WikiLinks = 1; # 1 = use LinkPattern, 0 = use [[word]] only $AdminDelete = 1; # 1 = Admin only page, 0 = Editor can delete pages $RunCGI = 1; # 1 = Run script as CGI, 0 = Load but do not run $EmailNotify = 0; # 1 = use email notices, 0 = no email on changes $EmbedWiki = 0; # 1 = no headers/footers, 0 = normal wiki pages $UseBuiltinDiff = 0; # 1 = use built-in diff, 0 = use external diff # Minor options: $LogoLeft = 0; # 1 = logo on left, 0 = logo on right $RecentTop = 1; # 1 = recent on top, 0 = recent on bottom $UseDiffLog = 1; # 1 = save diffs to log, 0 = do not save diffs $KeepMajor = 1; # 1 = keep major rev, 0 = expire all revisions $KeepAuthor = 1; # 1 = keep author rev, 0 = expire all revisions $ShowEdits = 0; # 1 = show minor edits, 0 = hide edits by default $HtmlLinks = 0; # 1 = allow A HREF links, 0 = no raw HTML links $SimpleLinks = 0; # 1 = only letters, 0 = allow _ and numbers $NonEnglish = 0; # 1 = extra link chars, 0 = only A-Za-z chars $ThinLine = 0; # 1 = fancy
tags, 0 = classic wiki
$BracketText = 1; # 1 = allow [URL text], 0 = no link descriptions $UseAmPm = 1; # 1 = use am/pm in times, 0 = use 24-hour times $UseIndex = 0; # 1 = use index file, 0 = slow/reliable method $UseHeadings = 1; # 1 = allow = h1 text =, 0 = no header formatting $NetworkFile = 0; # 1 = allow remote file:, 0 = no file:// links $BracketWiki = 0; # 1 = [WikiLnk txt] link, 0 = no local descriptions $UseLookup = 1; # 1 = lookup host names, 0 = skip lookup (IP only) $FreeUpper = 1; # 1 = force upper case, 0 = do not force case $FastGlob = 1; # 1 = new faster code, 0 = old compatible code # HTML tag lists, enabled if $HtmlTags is set. # Scripting is currently possible with these tags, # so they are *not* particularly "safe". # Tags that must be in ... pairs: @HtmlPairs = qw(b i u font big small sub sup h1 h2 h3 h4 h5 h6 cite code em s strike strong tt var div center blockquote ol ul dl table caption); # Single tags (that do not require a closing /tag) @HtmlSingle = qw(br p hr li dt dd tr td th); @HtmlPairs = (@HtmlPairs, @HtmlSingle); # All singles can also be pairs # == You should not have to change anything below this line. ============= $IndentLimit = 20; # Maximum depth of nested lists $PageDir = "$DataDir/page"; # Stores page data $HtmlDir = "$DataDir/html"; # Stores HTML versions $UserDir = "$DataDir/user"; # Stores user data $KeepDir = "$DataDir/keep"; # Stores kept (old) page data $TempDir = "$DataDir/temp"; # Temporary files and locks $LockDir = "$TempDir/lock"; # DB is locked if this exists $InterFile = "$DataDir/intermap"; # Interwiki site->url map $RcFile = "$DataDir/rclog"; # New RecentChanges logfile $RcOldFile = "$DataDir/rclog.old"; # Old RecentChanges logfile $IndexFile = "$DataDir/pageidx"; # List of all pages # --------------------------------- sub InitLinkPatterns { my ($UpperLetter, $LowerLetter, $AnyLetter, $LpA, $LpB, $QDelim); # Field separators are used in the URL-style patterns below. $FS = "\xb3"; # The FS character is a superscript "3" $FS1 = $FS . "1"; # The FS values are used to separate fields $FS2 = $FS . "2"; # in stored hashtables and other data structures. $FS3 = $FS . "3"; # The FS character is not allowed in user data. $UpperLetter = "[A-Z"; $LowerLetter = "[a-z"; $AnyLetter = "[A-Za-z"; if ($NonEnglish) { $UpperLetter .= "\xc0-\xde"; $LowerLetter .= "\xdf-\xff"; $AnyLetter .= "\xc0-\xff"; } if (!$SimpleLinks) { $AnyLetter .= "_0-9"; } $UpperLetter .= "]"; $LowerLetter .= "]"; $AnyLetter .= "]"; # Main link pattern: lowercase between uppercase, then anything $LpA = $UpperLetter . "+" . $LowerLetter . "+" . $UpperLetter . $AnyLetter . "*"; # Optional subpage link pattern: uppercase, lowercase, then anything $LpB = $UpperLetter . "+" . $LowerLetter . "+" . $AnyLetter . "*"; # #search_in_page my $LpC = "#[a-zA-Z0-9_]+"; if ($UseSubpage) { # Loose pattern: If subpage is used, subpage may be simple name $LinkPattern = "((?:(?:(?:$LpA)?\\/$LpB)|$LpA)(?:$LpC)?)"; # Strict pattern: both sides must be the main LinkPattern # $LinkPattern = "((?:(?:$LpA)?\\/)?$LpA)"; } else { $LinkPattern = "($LpA)"; } $QDelim = '(?:"")?'; # Optional quote delimiter (not in output) $LinkPattern .= $QDelim; # Inter-site convention: sites must start with uppercase letter # (Uppercase letter avoids confusion with URLs) $InterSitePattern = $UpperLetter . $AnyLetter . "+"; $InterLinkPattern = "((?:$InterSitePattern:[^\\]\\s\"<>$FS]+)$QDelim)"; if ($FreeLinks) { # Note: the - character must be first in $AnyLetter definition if ($NonEnglish) { $AnyLetter = "[-,.()' _0-9A-Za-z\xc0-\xff]"; } else { $AnyLetter = "[-,.()' _0-9A-Za-z]"; } } $FreeLinkPattern = "($AnyLetter+)"; if ($UseSubpage) { $FreeLinkPattern = "((?:(?:$AnyLetter+)?\\/)?$AnyLetter+)"; } $FreeLinkPattern .= $QDelim; # Url-style links are delimited by one of: # 1. Whitespace (kept in output) # 2. Left or right angle-bracket (< or >) (kept in output) # 3. Right square-bracket (]) (kept in output) # 4. A single double-quote (") (kept in output) # 5. A $FS (field separator) character (kept in output) # 6. A double double-quote ("") (removed from output) $UrlProtocols = "http|https|ftp|afs|news|nntp|mid|cid|mailto|wais|" . "prospero|telnet|gopher"; $UrlProtocols .= '|file' if $NetworkFile; $UrlPattern = "((?:(?:$UrlProtocols):[^\\]\\s\"<>$FS]+)$QDelim)"; $ImageExtensions = "(gif|jpg|png|bmp|jpeg)"; $RFCPattern = "RFC\\s?(\\d+)"; $ISBNPattern = "(?:ISBN|ASIN):?((?: ?[A-Za-z0-9-]){10})"; $PFLinkPattern = "PFLink:([A-Za-z0-9_]+)"; $PostcodePattern = '\b([A-Z][A-Z]?[0-9][0-9]? [0-9][A-Z][A-Z])\b'; $LatLongPattern = 'OS grid +(?:reference )? *(?:\()?(X?[:=]?[0-9]+)[, ] *(Y?[:=]?[0-9]+(?:\))?)'; $RevisionPattern ='Revision:([0-9]+)'; } sub ReIndex { my ($name, $link); my (@links, @newlinks, @pglist, %pgExists); %pgExists = (); @pglist = &AllPagesList(); foreach $name (@pglist) { $pgExists{$name} = 1; } foreach $name (@pglist) { my $basename = $name; $basename =~ s/^(.*)?\/.*$/$1/; @newlinks = (); @links = &GetPageLinks($name); foreach $link (@links) { $link =~ s/^\//$basename\//; print $name."->".$link."\n"; push(@newlinks, $link); } @links = @newlinks; @links = sort(@links); Update ("searchdb/", $name, \@newlinks); } } sub SplitUrlPunct { my ($url) = @_; my ($punct); if ($url =~ s/\"\"$//) { return ($url, ""); # Delete double-quote delimiters here } $punct = ""; ($punct) = ($url =~ /([^a-zA-Z0-9\/\xc0-\xff]+)$/); $url =~ s/([^a-zA-Z0-9\/\xc0-\xff]+)$//; return ($url, $punct); } sub StripUrlPunct { my ($url) = @_; my ($junk); ($url, $junk) = &SplitUrlPunct($url); $url =~ s/\#.+//; return $url; } sub FreeToNormal { my ($id) = @_; $id =~ s/ /_/g; $id = ucfirst($id); if (index($id, '_') > -1) { # Quick check for any space/underscores $id =~ s/__+/_/g; $id =~ s/^_//; $id =~ s/_$//; if ($UseSubpage) { $id =~ s|_/|/|g; $id =~ s|/_|/|g; } } if ($FreeUpper) { # Note that letters after ' are *not* capitalized if ($id =~ m|[-_.,\(\)/][a-z]|) { # Quick check for non-canonical case $id =~ s|([-_.,\(\)/])([a-z])|$1 . uc($2)|ge; } } return $id; } sub ExtractLinks { my ($text) = @_; my @links = (); $text =~ s/((.|\n)*?)<\/html>/ /ig; $text =~ s/(.|\n)*?\<\/nowiki>/ /ig; $text =~ s/
(.|\n)*?\<\/pre>/ /ig;
  $text =~ s/(.|\n)*?\<\/code>/ /ig;

  {
      my $fl = $FreeLinkPattern;
      $text =~ s/\[ToothyWiki:$fl [^\]]+\]/push(@links, &FreeToNormal($1)), ' '/ge;
  }

  {
    $text =~ s/$InterLinkPattern/ /g;
  }

  {
    $text =~ s/$UrlPattern/ /g;
  }

  if ($FreeLinks) {
      my $fl = $FreeLinkPattern;
      $text =~ s/\[\[$fl\|[^\]]+\]\]/push(@links, &FreeToNormal($1)), ' '/ge;
      $text =~ s/\[\[$fl\]\]/push(@links, &FreeToNormal($1)), ' '/ge;
  }

  if ($WikiLinks) {
      $text =~ s/$LinkPattern/push(@links, &StripUrlPunct($1)), ' '/ge;
  }

  $text =~ s/ --([A-Z][A-Za-z0-9]*) /push(@links, &StripUrlPunct($1)), ' '/ge;

  return @links;
}

sub GetPageLinks {
  my ($name) = @_;

  &OpenPage($name);
  &OpenDefaultText();

  return &ExtractLinks($Text{'text'});
}

sub OpenNewSection {
  my ($name, $data) = @_;

  %Section = ();
  $Section{'name'} = $name;
  $Section{'version'} = 1;      # Data format version
  $Section{'revision'} = 0;     # Number of edited times
  $Section{'tscreate'} = $Now;  # Set once at creation
  $Section{'ts'} = $Now;        # Updated every edit
  $Section{'ip'} = $ENV{REMOTE_ADDR};
  $Section{'host'} = '';        # Updated only for real edits (can be slow)
  $Section{'id'} = $UserID;
  $Section{'username'} = &GetParam("username", "");
  $Section{'data'} = $data;
  $Page{$name} = join($FS2, %Section);  # Replace with save?
}

sub OpenNewText {
  my ($name) = @_;  # Name of text (usually "default")
  %Text = ();
  # Later consider translation of new-page message? (per-user difference?)
  if ($NewText ne '') {
    $Text{'text'} = T($NewText);
  } else {
    $Text{'text'} = T('Describe the new page here.') . "\n";
  }
  $Text{'text'} .= "\n"  if (substr($Text{'text'}, -1, 1) ne "\n");
  $Text{'minor'} = 0;      # Default as major edit
  $Text{'newauthor'} = 1;  # Default as new author
  $Text{'summary'} = '';
  &OpenNewSection("text_$name", join($FS3, %Text));
}

sub OpenSection {
  my ($name) = @_;

  if (!defined($Page{$name})) {
    &OpenNewSection($name, "");
  } else {
    %Section = split(/$FS2/, $Page{$name}, -1);
  }
}

sub OpenText {
  my ($name) = @_;

  if (!defined($Page{"text_$name"})) {
    &OpenNewText($name);
  } else {
    &OpenSection("text_$name");
    %Text = split(/$FS3/, $Section{'data'}, -1);
  }
}

sub OpenDefaultText {
  &OpenText('default');
}

sub GetPageDirectory {
  my ($id) = @_;

  if ($id =~ /^([a-zA-Z])/) {
    return uc($1);
  }
  return "other";
}

sub GetPageFile {
  my ($id) = @_;

  return $PageDir . "/" . &GetPageDirectory($id) . "/$id.db";
}

sub OpenPage {
  my ($id) = @_;
  my ($fname, $data);

  if ($OpenPageName eq $id) {
    return;
  }
  %Section = ();
  %Text = ();
  $fname = &GetPageFile($id);
  if (-f $fname) {
    $data = &ReadFileOrDie($fname);
    %Page = split(/$FS1/, $data, -1);  # -1 keeps trailing null fields
  } else {
    die ("$id doesn't exist!");
  }
  $OpenPageName = $id;
}

sub ReadFile {
  my ($fileName) = @_;
  my ($data);
  local $/ = undef;   # Read complete files

  if (open(IN, "<$fileName")) {
    $data=;
    close IN;
    return (1, $data);
  }
  return (0, "");
}

sub ReadFileOrDie {
  my ($fileName) = @_;
  my ($status, $data);

  ($status, $data) = &ReadFile($fileName);
  if (!$status) {
    die(Ts('Can not open %s', $fileName) . ": $!");
  }
  return $data;
}

sub GenerateAllPagesList {
  my (@pages, @dirs, $id, $dir, @pageFiles, @subpageFiles, $subId);

  @pages = ();
  if ($FastGlob) {
    # The following was inspired by the FastGlob code by Marc W. Mengel.
    # Thanks to Bob Showalter for pointing out the improvement.
    opendir(PAGELIST, $PageDir);
    @dirs = readdir(PAGELIST);
    closedir(PAGELIST);
    @dirs = sort(@dirs);
    foreach $dir (@dirs) {
      next  if (($dir eq '.') || ($dir eq '..'));
      opendir(PAGELIST, "$PageDir/$dir");
      @pageFiles = readdir(PAGELIST);
      closedir(PAGELIST);
      foreach $id (@pageFiles) {
        next  if (($id eq '.') || ($id eq '..'));
        if (substr($id, -3) eq '.db') {
          push(@pages, substr($id, 0, -3));
        } elsif (substr($id, -4) ne '.lck') {
          opendir(PAGELIST, "$PageDir/$dir/$id");
          @subpageFiles = readdir(PAGELIST);
          closedir(PAGELIST);
          foreach $subId (@subpageFiles) {
            if (substr($subId, -3) eq '.db') {
              push(@pages, "$id/" . substr($subId, 0, -3));
            }
          }
        }
      }
    }
  } else {
    # Old slow/compatible method.
    @dirs = qw(A B C D E F G H I J K L M N O P Q R S T U V W X Y Z other);
    foreach $dir (@dirs) {
      if (-e "$PageDir/$dir") {  # Thanks to Tim Holt
        while (<$PageDir/$dir/*.db $PageDir/$dir/*/*.db>) {
          s|^$PageDir/||;
          m|^[^/]+/(\S*).db|;
          $id = $1;
          push(@pages, $id);
        }
      }
    }
  }
  return sort(@pages);
}

sub AllPagesList {
  return &GenerateAllPagesList();
}

# ---------------------------------

&InitLinkPatterns();

if ($#ARGV < 1)
{
  die "Usage: searchtest.pl (fetch|update|delete) pagename [entries..]\n";
}

my $command = shift @ARGV;
my $pagename = shift @ARGV;

if ($command eq "reindex")
{
  &ReIndex();
}
elsif ($command eq "fetch")
{
  my $results = Fetch ("searchdb/", $pagename);
  foreach my $result (@$results)
  {
    print $result."\n";
  }
}
elsif ($command eq "update")
{
  Update ("searchdb/", $pagename, \@ARGV);
}
elsif ($command eq "delete")
{
  Delete ("searchdb/", $pagename);
}
else
{
  die "Usage: searchtest.pl (fetch|update|delete|reindex) pagename [entries..]\n";
}