Incorrect url specified

#!/usr/local/bin/perl # ##### print_hit_bold.pl ###### # expects a URL of the form: # http://your.www.server/cgibindir/print_hit_bold.pl/path/to/file?terms+to+hilite#first_hit # where the "path/to/file" is relative to your document root directory # NOTE: This routine uses the Environment Variable 'PATH_TRANSLATED' to # indicate the fully qualified path to the file that the user wishes to see # highlighted. If there are no terms following a '?' in the url, then the # 'Location:' directive will be used to return the original file. # The only files that can be accessed thru this script are those which can be # reached thru the Docs tree of your server. ################### # VERY IMPORTANT ################### # This routine READS the file directly -- it does not go thru the Web server # to obtain the file. Thus if you have directories in your main Web docs # tree that are restricted by ip/domain address or user authentication, then # either don't install this script, or modify it to not allow it to access # any files in such directories. # EXAMPLE if you want to exclude directories/files -- Uncomment the following # line and set its value appropriately # $DO_NOT_ACCESS = 'directory1/|directory2/|directory3/'; # print_hit_bold.pl -- Display documents selected by a WAIS search interface. # This attempts to "BOLD" every string which matches one of the search # words. It skips over anything within <>'s so that it hopefully doesn't # screw up any html stuff. It also skips over "non-formatted" sections # (LISTING, XMP, PLAINTEXT) and over the TITLE section. # It presents text files within

...

tags. # Michael A. Grady, UIUC/CCSO June 1994 (m-grady@uiuc.edu) # Mike Grady # modified on 7/12/94 to fix a bug with the "first_hit" anchor when the first # term that gets bolded is the "text" for a link. Small change on 7/20/94 to # exempt TITLE field from being bolded. Change on 7/21/94 to have file read # work in line mode rather than paragraph mode -- paragraph mode appears to # be "dangerous" with lots of the WWW type files (few blank lines in # documents). Change on 8/15/94 to add BASE tag to front so relative URLs from # this document DO NOT go back thru print_hit_bold. # Change on 01/18/95 to fix occasional problem with placing of "first_hit" # anchor. Added a "buffer" (variable $buffer_it) to hold the anchor/link text # until we are sure we don't need to place the 'name="first_hit"' into it. # 09/19/95 -- Added some security comments to the top of the script, and some # code just in case a given WWW server allows a '..' in the additional # PATH_INFO part of a url. Also put in place code that can be used to restrict # this script from reading files in a particular directory. # Specify the url for this WWW server -- same as in "kidofwais.pl" script # (this is the same string you added when you waisindexed (using -t url)) $serverURL = "http://www.grmi.org/"; # For DEBUG purposes, specify the fully-qualified name of a file in the # logging directory for your server. Then if you turn on the $DEBUG flag, # all the lines that this script sends to the HTTPD server will instead be # sent into this logfile, except for the beginning "Content-type" lines and # the pointer to the original document before it has been mucked with by this # script. The match structure built by "prepare_matching_program()" is also # written into the log (actually will precede the "real file"). $debugLOG = "/local/www/wais/cgi_log"; $DEBUG = 0; # set to 1 to turn on debugging code # Specify the "first hit anchor" to be used with hiliting, # should match up with anchor value in "kidofwais.pl". See &hilite. # Also need to add to end of hit to end anchor. $anchor = ' name="first_hit"'; # who maintains this service? $maintainer = 'WebMaster'; # you shouldn't have to edit anything below this line sub send_file { print "Location: /$partial_name\n\n"; } sub prepare_matching_program { # prepare the query word matching patterns (will use eval on this to # hopefully make it faster). Creates "$match_terms" and initializes # "$looking_for_first" to "on" (=1). This sets up the terms that we will # search and substitute for, adding BOLD tags around. The bold tags we # add are ... . was chosen (with cap and space between B # and >) so as to be unlikely to be the same as any of the original BOLD # tags in the document. This enables us, on the "first match", to find # the position in the string of the "first match" and add the anchor tag # in front of that (so the document is positioned to the first hit when # presented to the user). local($searchterm); $looking_for_first = 1; $match_terms = "{study;\n"; foreach $searchterm (@query) { # remove parens, quotes, words: and or not $searchterm =~ tr/\'\"//d; $searchterm =~ /^and$/i && next; $searchterm =~ /^or$/i && next; $searchterm =~ /^not$/i && next; $searchterm =~ /^\s*$/ && next; $searchterm =~ /^.+\*$/ && $searchterm =~ tr/*//d && ($match_terms .= "\$matched = 1 if s!\\b($searchterm)!\$1!ig;\n") && next; #default case $searchterm =~ tr/*//d; $match_terms .= "\$matched = 1 if s!\\b($searchterm)\\b!\$1!ig;\n"; } #end foreach searchterm $match_terms .= "}"; $DEBUG && print LOG "$match_terms\n"; return; } sub hilite { # Search for the query words and place BOLD tags around each if found. # If the "first hit" in this document, put in the "first_hit" anchor. # Also need to add to end of hit to end anchor. local($_) = @_; # text to hilite local($position,$end_position); $matched = 0; eval ($match_terms); # do the matching/replacing if ($looking_for_first && $matched) { # something matched for the 1st $looking_for_first = 0; # time, try to place an anchor $position = index( $_, "" ); # put anchor text into string if above found if ($position >= 0) { # If text that has been bolded first is already within a # .., construct, then we can't put another such # construct around it. So instead stick the "name=" string # into the construct that is already there. If it # already has a name= clause, not sure which will be # recognized. if ($buffer_it && $in_anchor) { $position = index( $buffer_it, ">" ); substr($buffer_it, $position, 0) = $anchor; } elsif (! $in_anchor) { substr($_, $position, 0) = ""; $end_position = index( $_, "", $position); ($end_position >= 0) && (substr($_, $end_position + 5, 0) = ""); } } } &print_it ( $_ ); return; } sub print_it { # This routine does the actual "printing" of the document # If we are in an anchor string, and have not yet found the # "first_hit", then save the text to be printed in a buffer. This will # allow us to put the 'name="first_hit"' into the anchor/link # construct itself if the first hit happens to be in the text that the # anchor/link is "around". local($to_print) = @_; if ($in_anchor && $looking_for_first) { $buffer_it .= $to_print; } else { print $buffer_it if $buffer_it; print $to_print; $buffer_it = ''; } } sub process_file { # if 'PATH_INFO'/'PATH_TRANSLATED' has a non-null value, then # 'PATH_TRANSLATED' should be the fully qualified path to the file to # to be hilited. If these environment variables are null, then someone # has constructed an incorrect url $partial_name = $ENV{'PATH_INFO'}; $full_name = $ENV{'PATH_TRANSLATED'}; # The servers I'm familiar with don't allow a '..' in the 'PATH_INFO', # but just in case some server does, check for it. # Also make sure the file requested is NOT in a restricted directory. if (($partial_name eq "") || ($partial_name =~ /\.\./) || ($DO_NOT_ACCESS && ($partial_name =~ m"$DO_NOT_ACCESS"))) { print "Content-type: text/html\n\n"; print "Incorrect url specified\n"; print "
Incorrect url specified
\n"; print "
"; print "This script has been referenced by an incorrect url. Please "; print "contact $maintainer if you have any questions.
\n"; print "Main page for this server.\n"; return; } $partial_name = substr($partial_name,1); # remove beginning slash TYPE: for ($partial_name) { /.html$/ && do { $type = 'html'; $ok = 1; last TYPE }; /.txt$/ && do { $type = 'text'; $ok = 1; last TYPE }; /.TEXT$/ && do { $type = 'text'; $ok = 1; last TYPE }; $ok = 0; } do { &send_file; return; } unless defined @ARGV; # nothing to hilite # probably can't try to hilite without messing up file unless html/txt do { &send_file; return; } unless $ok; local(@query) = @ARGV; local($pquery) = join(" ", @query); # NCSA's HTTPD puts backslashes in front of "funny" or "dangerous" # characters in the input supplied thru argv. In the case of search terms # for WAIS, this can screw up the search (parens and "*" get backslashed # and then don't work correctly). So remove the backslashes, AND the # potentially "dangerous" characters ( ; ` ! ). $pquery =~ tr/!\;\`\\//d; # just in case, get rid of ;`! and \ @query = split(' ',$pquery); # and recreate query word array # DEBUG code - write stuff into file $DEBUG && do { open (LOG, ">>$debugLOG") || die "can't open log";}; &prepare_matching_program; # create $match_terms string # just send file if there are no words left to hilite do { &send_file; return; } if ($match_terms eq "{study;\n}"); # start the html document to "return" print "Content-type: text/html\n\n"; print "\n"; print "Get the original (unBOLDed) document.
\n"; $DEBUG && select (LOG); # write rest of stuff to file for DEBUG # try and open file for reading open (FP, "$full_name") || print "File $full_name can't be read. Please contact $maintainer." && return; if ($type eq 'text') { # Start document for "text" print "File: $partial_name\n"; print "
\n
"; } local($in_tag,$skip_till_endtag,$endtag,$line_left); $in_tag = 0; $skip_till_endtag = $in_anchor = 0; $endtag = $buffer_it = ''; READ_LINE: while () { $line_left = $_; PROCESS_LINE: for ($line_left) { ($line_left eq "") && next READ_LINE; # nothing left of this # paragraph $skip_till_endtag && $line_left =~ /$endtag/i && do { $line_left = $'; &print_it ( "$`$endtag" ); $skip_till_endtag = 0; $endtag = ""; redo PROCESS_LINE; }; $skip_till_endtag && do { # endtag must be in next paragraph &print_it ( $line_left ); next READ_LINE; }; ! $in_tag && $line_left =~ // && do { $line_left = $'; &print_it ( "$`>" ); $in_tag = 0; redo PROCESS_LINE; }; $in_tag && do { # ending ">" must be in next paragraph &print_it ( $line_left ); next READ_LINE; }; &hilite($line_left); # default case: hilite rest of paragraph next READ_LINE; } # end line_left foreach } print $buffer_it if $buffer_it; # print out "leftover stuff" if any close(FP); print "\n
\n\n" if ($type eq 'text'); # end document for "text" return; } open (STDERR,"> /dev/null"); eval '&process_file';