#!/usr/bin/perl -T
#
# CGI script: given one or more word ids in the "id" parameter (separated by
# spaces), list the other words that occur in the same sentences, ordered by
# how many of those sentences they appear in.
#
# The rendered page fragment is cached in the htmlcache table, keyed by the
# relative request URL (including the query string); a later request for the
# same URL is answered straight from that table.
#
# NOTE(review): several string literals below look as if their HTML tags were
# lost at some point (e.g. the list markup around each result). The text is
# reproduced exactly as found -- confirm against the deployed copy.

use strict;
use warnings;
use CGI qw(:standard);
use CGI::Carp qw(fatalsToBrowser);
use DBI;
use Reviews;

$CGI::POST_MAX        = 1024 * 15;    # max 15K posts
$CGI::DISABLE_UPLOADS = 1;            # no uploads
$ENV{PATH}            = "";           # required under taint mode (-T)

# Normalize a dbquery() result into a flat list.
# The original code shows dbquery() handing back a plain scalar for a single
# value and an array reference for several; its exact contract lives in
# Reviews.pm, so every shape (undef, scalar, array ref) is accepted here
# instead of being dereferenced blindly.
sub _as_list {
    my ($result) = @_;
    return ()       if !defined $result;
    return @$result if ref $result eq 'ARRAY';
    return ($result);
}

my $q = CGI->new;
print $q->header(-expires => 'now');
print $q->start_html();

my $begin_time = time();
my $dbh        = make_db_handle();

# ok, first...if we've seen this url before, just spit back what
# we returned last time.
# if we haven't seen it before, we need to cache our results at
# the end.
my $out = '';
my $url = $q->url(-relative => 1, -query => 1);

# The URL comes straight from the request, so it must be quoted by the
# driver before it is placed in SQL.
# Assumes make_db_handle() returns a DBI database handle (this file loads
# DBI) -- TODO confirm in Reviews.pm.
my $quoted_url = $dbh->quote($url);

my $html = dbquery($dbh, qq{
    SELECT html
    FROM htmlcache
    WHERE url = $quoted_url
});

if ($html) {
    #print "READING FROM CACHE...";
    print $html;
    # The cached fragment is stored without the closing tags, so close the
    # document here exactly as the uncached path does.
    print $q->end_html();
    exit;
}

# Disabled sorting/paging parameters, kept for reference:
#my $order = $q->param('sort') || "totDESC";
#$order =~ s/[^A-Za-z,]//g;
#$order = substr($order, 0, 30);
#$order =~ s/desc/DESC/g;
#$order =~ s/DESC/ DESC/g;
#my $limit = $q->param('limit') || 100; #only return 100 at a time
#$limit =~ s/[^\d]//g;
#$limit = substr($limit, 0, 4);
#my $offset = $q->param('offset') || 0;
#$offset =~ s/[^\d]//g;
#$offset = substr($offset, 0, 7);
# not interested in words that occur less than the "floor"
#my $floor = $q->param('floor') || 50;
#$floor =~ s/[^\d]//g;
#$floor = substr($floor, 0, 4);

# get words from arguments (separated by space)
my $id = $q->param('id') || 1;
$id =~ s/[^\d ]//g;    # digits and spaces only: safe to interpolate into SQL

# split ' ' skips leading whitespace and collapses runs of spaces, so no
# empty token can reach the "WHERE id = ..." queries below.
my @word_ids = grep { length } split ' ', $id;
my @sentence_ids;

$out .= "\n\nwords that coexist with:\n";

my %seen_words;

# for each word, print words at the top, find sentences...
my $stuff_to_print = '';
my %word_id_to_word;
foreach my $word_id (@word_ids) {
    $word_id = substr($word_id, 0, 10);

    #print words at the top
    my $word = dbquery($dbh, qq{
        SELECT word
        FROM word
        WHERE id = $word_id
    });
    $word = '' if !defined $word;    # unknown id: shown as an empty entry

    $stuff_to_print .= "$word, ";
    $seen_words{$word} = 1;
    $word_id_to_word{$word_id} = $word;

    # find all sentences containing these words
    my $found = dbquery($dbh, qq{
        SELECT sentence_id
        FROM word_sentence
        WHERE word_id = $word_id
    });
    push @sentence_ids, _as_list($found);
}
$stuff_to_print =~ s/, $//;
$out .= "$stuff_to_print\n\n";

# next, find all other words in the sentences we collected
my %word_counts;
my %id_to_word;
my %seen_sentence;
my $seen_sentence_count = 0;
foreach my $sentence_id (@sentence_ids) {
    next if $seen_sentence{$sentence_id};

    my $other_words = dbquery($dbh, qq{
        SELECT word_id,word
        FROM word_sentence,word
        WHERE sentence_id = $sentence_id
        AND word_id = word.id
    });

    $seen_sentence{$sentence_id} = 1;
    $seen_sentence_count++;

    # if we only have one word, there is an error...this is the workaround:
    # a single row arrives as a flat (word_id, word) pair rather than as a
    # list of rows, so wrap it. An empty result stays empty.
    my @rows = _as_list($other_words);
    @rows = ([@rows]) if @rows && !ref $rows[0];

    foreach my $other_word (@rows) {
        my ($other_word_id, $word_text) = @$other_word[0, 1];
        next if !defined $other_word_id;

        # don't count this word if it's the same as one of the
        # words we already have
        next if defined $word_text && $seen_words{$word_text};

        $word_counts{$other_word_id}++;
        $id_to_word{$other_word_id} = $word_text;
    }
}

$out .= "\n    \n";

# print 'em, most frequent first. Ties are broken on the id so the page
# that ends up in the cache does not depend on hash ordering.
my $count = 0;
foreach my $other_word_id (
    sort { $word_counts{$b} <=> $word_counts{$a} || $a cmp $b }
    keys %word_counts
) {
    # next if we have more than one word, but we only saw this once.
    #next if (($word_counts{$other_word_id} == 1) && ($#word_ids > 0));

    # next if we only saw this word once.
    #next if ($word_counts{$other_word_id} == 1);

    my $other_text = $id_to_word{$other_word_id};

    # skip very short words and a few common stop words
    next if !defined $other_text || length($other_text) < 3;
    next if $other_text eq "the";
    next if $other_text eq "The";
    next if $other_text eq "and";

    $count++;

    $out .= qq%\n  1. $other_text: $word_counts{$other_word_id}%;

    # print a more advanced joint KWIC here, one for each word...
    $out .= " [joint KWIC with: ";
    foreach my $word_id (@word_ids) {
        $out .= qq%$word_id_to_word{$word_id} \n%;
    }
    $out .= "]";

    last if $count == 500;    # cap the listing at 500 entries
}
$out .= "\n";

my $end_time     = time();
my $time_elapsed = $end_time - $begin_time;
$out .= "\n\n\nquery time: $time_elapsed seconds\n";

# @sentence_ids still contains duplicates; $seen_sentence_count does not.
my $total_sentences = scalar @sentence_ids;
$out .= "\ntotal sentences: $total_sentences, sentences after pruning: $seen_sentence_count\n";

print $out;
print $q->end_html();

# now put this text in the database...
# Both values are quoted by the driver rather than by hand-escaping '"',
# which left backslashes (and therefore the statement) open to breakage.
my $quoted_out = $dbh->quote($out);
dbquery($dbh, qq{
    INSERT INTO htmlcache (url,html)
    VALUES ($quoted_url,$quoted_out)
});