#!/usr/bin/perl -T
#
# CGI script: given one or more word ids (space-separated in the "id"
# parameter), list the words found within a window of "before"/"after"
# positions around each occurrence, ranked by how often they co-occur.
# The generated page body is cached in the htmlcache table keyed by the
# request URL, and served straight from there on later identical requests.
#
# Parameters:
#   id     - space-separated numeric word ids (default: 1)
#   before - number of positions to look at before each occurrence (default 2)
#   after  - number of positions to look at after each occurrence (default 2)
#
# make_db_handle() and dbquery() are not defined in this file; presumably
# they are exported by Reviews.  From the way results are used below,
# dbquery() in scalar context yields a plain scalar for a single value, a
# flat array ref for a single row, and an array ref of row array refs for
# several rows.
#
# NOTE(review): the string literals below look as if their HTML markup was
# stripped before this file was captured; they are reproduced as found.

use strict;
use warnings;
use CGI qw(:standard -nosticky);
use CGI::Carp qw(fatalsToBrowser);
use DBI;
use Reviews;

use constant DEFAULT_WINDOW => 2;      # window size when the parameter is absent
use constant MAX_WINDOW     => 50;     # upper bound: every position costs one query
use constant MAX_RESULTS    => 500;    # maximum number of words listed

$CGI::POST_MAX        = 1024 * 15;     # max 15K posts
$CGI::DISABLE_UPLOADS = 1;             # no uploads
$ENV{PATH}            = "";

my $q = CGI->new;
print $q->header(-expires => 'now');
print $q->start_html();

my $begin_time = time();
my $dbh        = make_db_handle();

# ok, first...if we've seen this url before, just spit back what
# we returned last time.
# if we haven't seen it before, we need to cache our results at
# the end.
my $out = '';
my $url = $q->url(-relative => 1, -query => 1);

# The URL contains the raw query string, so it must be quoted by the driver
# before it goes anywhere near SQL.
# NOTE(review): assumes make_db_handle() returns a DBI database handle.
my $quoted_url = $dbh->quote($url);

my $html = dbquery($dbh, qq{
    SELECT html FROM htmlcache WHERE url = $quoted_url
});
if ($html) {
    #print "\n\nREADING FROM CACHE...\n\n";
    print $html;
    exit;
}

# how many words before? how many words after?
# (scalar forces CGI::param out of list context inside the argument list)
my $before = window_size(scalar $q->param('before'));
my $after  = window_size(scalar $q->param('after'));

# get words from arguments (separated by space)
my $id = $q->param('id') || 1;
$id =~ tr/0-9 //cd;    # keep only ASCII digits and spaces

#my $restrict= $q->param('restrict');
#my $restrict_line = " ";
#if ($restrict eq "positive") {
#    $restrict_line = "AND rating_id > 74";
#} elsif ($restrict eq "negative") {
#    $restrict_line = "AND rating_id < 50";
#}

# Drop the empty fields produced by consecutive spaces (they would yield
# "WHERE id = " with nothing after it) and limit each id to 10 digits.
my @word_ids = map { substr($_, 0, 10) } grep { length } split / /, $id;
my @sentence_ids;

$out .= "\n\nwords that appear next to:\n";

# for each word, print words at the top, find word_sentence.position,
# and find other words with positions near it...
my %seen_words;
my %word_id_to_word;
my %word_id_to_sentence_and_position;
my @known_word_ids;    # requested ids that actually exist in the word table
my @header_words;

foreach my $word_id (@word_ids) {

    # print words at the top
    my $word = dbquery($dbh, qq{
        SELECT word FROM word WHERE id = $word_id LIMIT 1
    });

    # An id with no matching word has no occurrences either; skip it instead
    # of printing an empty entry.
    next unless defined $word && length $word;

    push @known_word_ids, $word_id;
    push @header_words,   $word;
    $seen_words{$word}         = 1;
    $word_id_to_word{$word_id} = $word;

    # find word sentences and positions
    my $rows = dbquery($dbh, qq{
        SELECT sentence_id,position FROM word_sentence
        WHERE word_id = $word_id
    });
    next unless ref $rows eq 'ARRAY' && @$rows;

    # A word occurring exactly once comes back as one flat row
    # [sentence_id, position] rather than a list of rows; wrap it so the
    # loop further down can always dereference a row.
    $rows = [$rows] unless ref $rows->[0];

    push @{ $word_id_to_sentence_and_position{$word_id} }, @$rows;
}

my $stuff_to_print = join ', ', @header_words;
$out .= "$stuff_to_print\n\n";

# give a form for people to decide how many words they're looking at
$out .= $q->start_form(-method => 'GET')
    . "words before:"
    . $q->textfield(-name => 'before', -default => $before, -size => 2)
    . "words after:"
    . $q->textfield(-name => 'after', -default => $after, -size => 2)
    . $q->hidden(-name => 'id', -default => $id)
    . " " . $q->submit(-value => 'Submit')
    . $q->end_form . "\n";

# next, find other words near the positions we collected
my %word_counts;
my %seen_sentence;
my $seen_sentence_count = 0;

foreach my $word_id (keys %word_id_to_sentence_and_position) {

    # look at each sentence/position pair
    foreach my $s_and_p_pair (@{ $word_id_to_sentence_and_position{$word_id} }) {
        my ($sentence_id, $position) = @$s_and_p_pair;

        # every occurrence counts towards the total, even in a sentence
        # that has already been examined
        push @sentence_ids, $sentence_id;
        next if $seen_sentence{$sentence_id};

        my $begin = $position - $before;
        my $end   = $position + $after;

        foreach my $current_position ($begin .. $end) {
            next if $current_position == $position;

            #print "current position is $current_position\n";
            # count these words...
            my $new_word = dbquery($dbh, qq{
                SELECT word_id,word FROM word_sentence,word
                WHERE sentence_id = $sentence_id
                AND position = $current_position
                AND word_id = word.id
                LIMIT 1
            });
            next unless $new_word;

            my ($new_word_id, $new_word_text) = @$new_word;

            # the words being searched for are not their own neighbours
            next if $seen_words{$new_word_text};

            $word_counts{$new_word_id}++;
            $word_id_to_word{$new_word_id} = $new_word_text;
        }

        $seen_sentence{$sentence_id} = 1;
        $seen_sentence_count++;
    }
}

$out .= "\n    \n";

# print 'em, most frequent first
my $count = 0;
foreach my $other_word_id (
    sort { $word_counts{$b} <=> $word_counts{$a} } keys %word_counts
) {
    my $other_word = $word_id_to_word{$other_word_id};

    # next if we have more than one word, but we only saw this once.
    #next if (($word_counts{$other_word_id} == 1) && ($#word_ids > 0));
    # next if we only saw this word once
    #next if ($word_counts{$other_word_id} == 1);

    # skip very short words and a few common stop words
    next if length($other_word) < 3;
    next if $other_word eq "the";
    next if $other_word eq "The";
    next if $other_word eq "and";

    $count++;
    $out .= "\n  1. $other_word: $word_counts{$other_word_id}";

    # print a more advanced joint KWIC here, one for each word...
    $out .= " [joint KWIC with: ";
    foreach my $word_id (@known_word_ids) {
        $out .= "$word_id_to_word{$word_id} \n";
    }
    $out .= "]";

    last if $count == MAX_RESULTS;
}
$out .= "\n";

my $time_elapsed = time() - $begin_time;
$out .= "\n\n\nquery time: $time_elapsed seconds\n";

my $total_sentences = scalar @sentence_ids;
$out .= "\ntotal sentences: $total_sentences, sentences after pruning: $seen_sentence_count\n";

$out .= $q->end_html();
print $out;

# now put this text in the database...
# Driver-level quoting handles quotes, backslashes and anything else that
# would otherwise break out of the string literal.
my $quoted_out = $dbh->quote($out);
dbquery($dbh, qq{
    INSERT INTO htmlcache (url,html) VALUES ($quoted_url,$quoted_out)
});

# window_size($raw)
#
# Turn a raw "before"/"after" request parameter into a safe window size.
#   - undef (parameter absent)        -> DEFAULT_WINDOW
#   - no leading digits / negative    -> 0
#   - anything larger than MAX_WINDOW -> MAX_WINDOW
# The result is taken from a regex capture, so it is also untainted.
sub window_size {
    my ($raw) = @_;

    return DEFAULT_WINDOW unless defined $raw;

    my ($digits) = $raw =~ /\A\s*([0-9]+)/;
    return 0 unless defined $digits;

    my $size = $digits + 0;
    return $size > MAX_WINDOW ? MAX_WINDOW : $size;
}