#!/usr/bin/perl -T
use strict;
use CGI qw(:standard -nosticky);
use CGI::Carp qw(fatalsToBrowser);
use DBI;
use Reviews;
# Request limits and environment hardening. The script runs with -T, so the
# PATH is emptied before anything else happens.
$CGI::POST_MAX        = 1024 * 15;  # reject POST bodies larger than 15K
$CGI::DISABLE_UPLOADS = 1;          # no file uploads
$ENV{PATH}            = "";

# Direct method call instead of the indirect-object form "new CGI", which
# perl can mis-parse and which perlobj discourages.
my $q = CGI->new;

# The header and opening HTML are sent immediately; everything after this
# point is body content.
print $q->header(-expires => 'now');
print $q->start_html();

# Start of the timed section reported at the bottom of the page.
my $begin_time = time();

# make_db_handle is not defined in this file; presumably exported by Reviews.
my $dbh = make_db_handle();
# Cache lookup: if this exact URL (script name plus query string) has been
# served before, replay the stored HTML and stop. Otherwise fall through and
# build the page; the result is stored at the end of the script.
my $out;
my $url = $q->url(-relative=>1, -query=>1);

# $url is derived from the request, so it must never be pasted into SQL raw.
# quote() returns a correctly escaped, already-delimited string literal.
# NOTE(review): assumes $dbh is a DBI database handle -- confirm against
# make_db_handle.
my $quoted_url = $dbh->quote($url);
my $html = dbquery($dbh, qq/
    SELECT html FROM htmlcache WHERE url = $quoted_url
/);
if ($html) {
    # Debug aid, kept on a single line so that it stays fully commented out:
    #print "READING FROM CACHE...";
    print $html;
    exit;
}
# Context window: how many words before and after each hit are examined.
# Both default to 2 when the parameter is absent.
my $before = $q->param('before');
$before = 2 unless defined $before;
my $after = $q->param('after');
$after = 2 unless defined $after;

# Every position inside the window costs one database query per sentence, so
# the window sizes are reduced to plain non-negative integers and capped.
# Anything that does not start with digits counts as 0, which is what the
# arithmetic on the raw value would have produced anyway.
my $max_window = 99;
foreach my $window ($before, $after) {    # aliases $before / $after
    $window = ($window =~ /\A\s*([0-9]{1,9})/) ? $1 + 0 : 0;
    $window = $max_window if $window > $max_window;
}

# Word ids arrive as a space-separated list in 'id'; default to word 1.
# Everything except ASCII digits and spaces is stripped before the ids get
# anywhere near SQL.
my $id = $q->param('id') || 1;
$id =~ s/[^0-9 ]//g;

#my $restrict= $q->param('restrict');
#my $restrict_line = " ";
#if ($restrict eq "positive") {
# $restrict_line = "AND rating_id > 74";
#} elsif ($restrict eq "negative") {
# $restrict_line = "AND rating_id < 50";
#}

# split ' ' (rather than / /) skips leading blanks and collapses runs of
# blanks, so no empty id can end up in a query.
my @word_ids = split ' ', $id;
my @sentence_ids;
$out .= "words that appear next to:\n";
my %seen_words;

# For each requested word id: look up the word itself (listed at the top of
# the page and remembered in %seen_words), then collect every
# (sentence_id, position) pair at which that word occurs.
my $stuff_to_print;
my %word_id_to_word;
my %word_id_to_sentence_and_position;
my @heading_words;
foreach my $word_id (@word_ids) {
    # $word_id aliases the element of @word_ids, so the truncated id is also
    # what later lookups keyed on @word_ids will use.
    $word_id = substr($word_id, 0, 10);

    # Only plain digit strings may be interpolated into the queries below.
    next unless $word_id =~ /\A[0-9]+\z/;

    my $word = dbquery($dbh, qq/
        SELECT word FROM word WHERE id = $word_id LIMIT 1
    /);
    if (defined $word) {
        # Unknown ids contribute nothing to the heading.
        push @heading_words, $word;
        $seen_words{$word} = 1;
    }
    $word_id_to_word{$word_id} = $word;

    # Find the sentences and positions in which this word occurs.
    my $rows = dbquery($dbh, qq/
        SELECT sentence_id,position
        FROM word_sentence
        WHERE word_id = $word_id
    /);
    next unless ref($rows) eq 'ARRAY' && @$rows;

    # NOTE(review): the one-row query elsewhere in this script reads its
    # result as a flat [col, col] array, so a single matching row here
    # presumably comes back flat as well -- confirm against dbquery. Wrapping
    # it keeps every stored entry a [sentence_id, position] pair either way.
    $rows = [ $rows ] unless ref $rows->[0];

    push @{ $word_id_to_sentence_and_position{$word_id} }, @$rows;
}
$stuff_to_print = join(', ', @heading_words);
$out .= "$stuff_to_print\n";
# Offer a small GET form so the visitor can change the window sizes. The
# current word ids travel along in a hidden field. Each piece is appended
# with .= so that every CGI call is evaluated in scalar context, in the same
# order as a single concatenated expression would evaluate them.
$out .= $q->start_form(-method => 'GET');
foreach my $field ([ before => $before ], [ after => $after ]) {
    my ($name, $current) = @$field;
    $out .= "words ${name}:";
    $out .= $q->textfield(-name => $name, -default => $current, -size => 2);
}
$out .= $q->hidden(-name => 'id', -default => $id);
$out .= " ";
$out .= $q->submit(-value => 'Submit');
$out .= $q->end_form;
$out .= "\n";
# Next, count the words found within the requested window around each
# collected position.
#   %word_counts          word id      => number of times seen in a window
#   %seen_sentence        sentence id  => 1 once the sentence has been scanned
#   $seen_sentence_count  number of distinct sentences scanned; starts at 0
#                         so the summary shows a number even with no hits
my %word_counts;
my %seen_sentence;
my $seen_sentence_count = 0;
foreach my $word_id (keys %word_id_to_sentence_and_position) {
    # Look at each sentence/position pair.
    foreach my $s_and_p_pair (@{ $word_id_to_sentence_and_position{$word_id} }) {
        my ($sentence_id, $position) = @{$s_and_p_pair}[0, 1];

        # Every hit is recorded, including repeats, for the total shown in
        # the summary. Each sentence is scanned only once.
        push @sentence_ids, $sentence_id;
        next if $seen_sentence{$sentence_id};

        my $begin = $position - $before;
        my $end   = $position + $after;
        foreach my $current_position ($begin .. $end) {
            # Skip the search word's own position.
            next if $current_position == $position;

            # Debug aid, kept on one line so it stays fully commented out:
            #print "current position is $current_position";

            # One lookup per position; the single row is read as
            # [word_id, word].
            my $new_word = dbquery($dbh, qq/
                SELECT word_id,word FROM word_sentence,word
                WHERE sentence_id = $sentence_id
                AND position = $current_position
                AND word_id = word.id
                LIMIT 1
            /);
            next unless $new_word;

            my ($new_word_id, $new_word_text) = @{$new_word}[0, 1];

            # The search words themselves are not counted as neighbours.
            next if $seen_words{$new_word_text};

            $word_counts{$new_word_id}++;
            $word_id_to_word{$new_word_id} = $new_word_text;
        }
        $seen_sentence{$sentence_id} = 1;
        $seen_sentence_count++;
    }
}
# Disabled workaround for a single-word result, kept for reference:
#if (! ref $$other_words[0]) { $other_words = [ $other_words]; }
$out .= "\n";

# List neighbouring words, most frequent first, stopping after 500 entries.
# Words shorter than three characters and a few very common words are
# skipped and do not count towards the 500.
my %is_stopword = map { $_ => 1 } ('the', 'The', 'and');
my $count;
my @ranked_ids = sort { $word_counts{$b} <=> $word_counts{$a} } keys %word_counts;
foreach my $other_word_id (@ranked_ids) {
    my $other_word = $word_id_to_word{$other_word_id};

    # Disabled filters, kept for reference:
    #   skip words seen only once when more than one search word was given
    #next if (($word_counts{$other_word_id} == 1) && ($#word_ids > 0));
    #   skip words seen only once
    #next if ($word_counts{$other_word_id} == 1);
    next if length($other_word) < 3;
    next if $is_stopword{$other_word};

    $count++;
    $out .= "- " . $other_word . ": " . $word_counts{$other_word_id};

    # Bracketed list of the search words, one per line; placeholder for a
    # more advanced joint KWIC.
    $out .= " [joint KWIC with: "
          . join('', map { $word_id_to_word{$_} . " \n" } @word_ids)
          . "]";

    last if $count == 500;
}
$out .= "\n";
# Footer: elapsed wall-clock time and sentence statistics.
my $end_time = time();
my $time_elapsed = $end_time - $begin_time;
$out .= "\nquery time: $time_elapsed seconds\n";

# Total counts every hit, repeats included; the pruned figure counts distinct
# sentences and is shown as 0 when nothing was scanned.
my $total_sentences = scalar @sentence_ids;
my $pruned_sentences = defined $seen_sentence_count ? $seen_sentence_count : 0;
$out .= "\ntotal sentences: $total_sentences, sentences after pruning: $pruned_sentences\n";
$out .= $q->end_html();
print $out;

# Store the finished page under its URL so the next identical request can be
# answered from the cache. Escaping only double quotes by hand leaves
# backslashes unescaped, which can break the statement, so both values go
# through quote() instead. It returns complete, delimited literals.
# NOTE(review): assumes $dbh is a DBI database handle -- confirm against
# make_db_handle.
my $cache_url  = $dbh->quote($url);
my $cache_html = $dbh->quote($out);
dbquery($dbh, qq/
    INSERT INTO htmlcache (url,html) VALUES ($cache_url,$cache_html)
/);