#!/usr/bin/perl -T
use strict;
use CGI qw(:standard);
use CGI::Carp qw(fatalsToBrowser);
use DBI;
use Reviews;
# Request setup: restrict what the CGI layer accepts, start the HTML page,
# and open the database connection used by every query below.
$CGI::POST_MAX = 1024 * 15;    # max 15K posts
$CGI::DISABLE_UPLOADS = 1;     # no uploads
# The script runs with -T (see the shebang); PATH is emptied up front.
$ENV{PATH} = "";
my $q = CGI->new;
# HTTP header (expires immediately) followed by the opening HTML boilerplate.
print $q->header(-expires=>'now');
print $q->start_html();
# Start of the wall-clock measurement reported as "query time" at the end.
my $begin_time = time();
# make_db_handle is not defined in this file -- presumably exported by Reviews.
my $dbh = make_db_handle();
# Cache lookup: if this exact URL (path plus query string) has been rendered
# before, replay the stored page body and stop. Otherwise fall through; the
# freshly built page is stored in htmlcache at the end of the script.
my $out;
my $url = $q->url(-relative=>1, -query=>1);
# $url is taken straight from the request, so it must never be pasted into SQL
# unescaped. quote() returns the value as a correctly escaped SQL literal,
# including the surrounding quote characters.
# NOTE(review): assumes $dbh is a DBI database handle -- confirm make_db_handle.
my $lookup_url_sql = $dbh->quote($url);
my $html = dbquery($dbh, qq/
SELECT html FROM htmlcache WHERE url = $lookup_url_sql
/);
if ($html) {
    #print "\nREADING FROM CACHE...\n";
    print $html;
    # start_html() has already been printed for this response, and the cached
    # text holds only the page body, so the document must be closed here too.
    print $q->end_html();
    exit;
}
#my $order = $q->param('sort') || "totDESC";
#$order =~ s/[^A-Za-z,]//g;
#$order = substr($order, 0, 30);
#$order =~ s/desc/DESC/g;
#$order =~ s/DESC/ DESC/g;
#my $limit = $q->param('limit') || 100; #only return 100 at a time
#$limit =~ s/[^\d]//g;
#$limit = substr($limit, 0, 4);
#my $offset = $q->param('offset') || 0;
#$offset =~ s/[^\d]//g;
#$offset = substr($offset, 0, 7);
# not interested in words that occur less than the "floor"
#my $floor = $q->param('floor') || 50;
#$floor =~ s/[^\d]//g;
#$floor = substr($floor, 0, 4);
# Read the requested word ids from the 'id' parameter (space separated,
# defaulting to 1), look up each word, and collect the ids of every sentence
# that contains any of them.
my $id = $q->param('id') || 1;
# Keep ASCII digits and spaces only; this is what makes it safe to interpolate
# the ids into the SQL below.
$id =~ s/[^0-9 ]//g;
# split ' ' skips leading spaces and treats runs of spaces as one separator,
# so input such as " 1  2" cannot produce an empty id (and with it the
# malformed query "WHERE id = "). Each id is capped at 10 characters.
my @word_ids = map { substr($_, 0, 10) } split(' ', $id);
my @sentence_ids;
$out .= "words that coexist with:\n";
my %seen_words;
# for each word, print words at the top, find sentences...
my %word_id_to_word;
my @requested_words;
foreach my $word_id (@word_ids) {
    # remember the word text: it is listed at the top of the page, excluded
    # from the co-occurrence counts, and repeated in the KWIC annotation
    my $word = dbquery($dbh, qq/
SELECT word FROM word WHERE id = $word_id
/);
    push @requested_words, $word;
    $seen_words{$word} = 1;
    $word_id_to_word{$word_id} = $word;
    # find all sentences containing this word
    my $found = dbquery($dbh, qq/
SELECT sentence_id FROM word_sentence
WHERE word_id = $word_id
/);
    # dbquery is used elsewhere in this file as returning a plain scalar for a
    # single value, so accept an array ref, a lone id, or nothing at all
    # instead of dying on the dereference under strict refs.
    # NOTE(review): confirm dbquery's return shapes against Reviews.
    push @sentence_ids,
        ref $found eq 'ARRAY' ? @$found
      : defined $found        ? ($found)
      :                         ();
}
my $stuff_to_print = join(', ', @requested_words);
$out .= "$stuff_to_print\n";
# Walk the collected sentences once each and tally the other words in them.
#   %word_counts          word id => number of result rows seen for that word
#   %id_to_word           word id => word text
#   $seen_sentence_count  number of distinct sentences actually queried
my %word_counts;
my %id_to_word;
my %seen_sentence;
my $seen_sentence_count;
foreach my $sentence_id (@sentence_ids) {
    # a sentence collected for more than one requested word is processed once
    next if $seen_sentence{$sentence_id}++;
    my $rows = dbquery($dbh, qq/
SELECT word_id,word FROM word_sentence,word
WHERE sentence_id = $sentence_id
AND word_id = word.id
/);
    $seen_sentence_count++;
    # Workaround kept from the original: when the first element of the result
    # is not a reference, the result is taken to be one flat row and is
    # wrapped so that the loop below always sees a list of rows.
    $rows = [ $rows ] unless ref $rows->[0];
    for my $row (@$rows) {
        my ($other_id, $text) = ($row->[0], $row->[1]);
        # the requested words themselves are not counted
        next if $seen_words{$text};
        $word_counts{$other_id}++;
        $id_to_word{$other_id} = $text;
    }
}
$out .= "\n";
# List the co-occurring words, most frequent first, at most 500 entries.
# Words shorter than three characters and a few very common words are left out.
my %is_stop_word = map { $_ => 1 } ('the', 'The', 'and');
# The "joint KWIC" annotation names the requested words; it is the same for
# every entry, so it is assembled once up front.
my $kwic_suffix = " [joint KWIC with: "
    . join('', map { "$word_id_to_word{$_} \n" } @word_ids)
    . "]";
my @ranked_ids = sort { $word_counts{$b} <=> $word_counts{$a} } keys %word_counts;
my $count;
foreach my $other_word_id (@ranked_ids) {
    # disabled filters, kept for reference:
    # next if we have more than one word, but we only saw this once.
    #next if (($word_counts{$other_word_id} == 1) && ($#word_ids > 0));
    # next if we only saw this word once.
    #next if ($word_counts{$other_word_id} == 1);
    my $word = $id_to_word{$other_word_id};
    next if length($word) < 3 || $is_stop_word{$word};
    $count++;
    $out .= "- $word: $word_counts{$other_word_id}" . $kwic_suffix;
    last if $count == 500;
}
$out .= "\n";
# Append timing and sentence statistics to the page body, then send the body
# and close the HTML document.
my $end_time = time();
my $time_elapsed = $end_time - $begin_time;
$out .= "\nquery time: $time_elapsed seconds\n";
# every collected sentence id, duplicates included
my $total_sentences = scalar @sentence_ids;
# $seen_sentence_count is only ever incremented, never initialised, so it is
# still undef when no sentence was processed; report that as 0, not a blank.
my $pruned_sentences = $seen_sentence_count || 0;
$out .= "\ntotal sentences: $total_sentences, sentences after pruning: $pruned_sentences\n";
print $out;
print $q->end_html();
# Store the generated page body in htmlcache under its URL so that the next
# request for the same URL can be answered from the cache.
# Both values are turned into SQL literals with quote(): escaping only the
# double quote by hand leaves backslashes (and any other driver-specific
# special characters) unescaped, which can break or subvert the statement.
# NOTE(review): assumes $dbh is a DBI database handle -- confirm make_db_handle.
my $cache_url_sql  = $dbh->quote($url);
my $cache_html_sql = $dbh->quote($out);
dbquery($dbh, qq/
INSERT INTO htmlcache (url,html) VALUES ($cache_url_sql,$cache_html_sql)
/);