#!/links/bin/perl -w # # pitchfork reviews project: # this script makes the statistics for each word and stores them in a # table called "statistics" # # loren jan wilson, november 2003 # use strict; use DBI; use Reviews; # get words from database and calculate: # 1. tot: total wordcount # 2. pos: positive article wordcount # 3. neg: negative article wordcount # 4. neu: neutral article wordcount # 5. pwr: positive word ratio # 6. nwr: negative word ratio # 7. avg: average rating of articles containing this word # 8. art: number of articles this word occurs in # 9. aut: number of authors who use this word # 10. wordscore: (average - average average) * tot # # this all goes into a "statistics" table. # my $dbh = make_db_handle; # get words my $words = dbquery($dbh, qq/SELECT id, word from word/); # get positive rating value my $positive_rating = dbquery($dbh, qq/SELECT value FROM configuration where variable like "positive_rating"/); # get negative rating value my $negative_rating = dbquery($dbh, qq/SELECT value FROM configuration where variable like "negative_rating"/); # lock tables dbquery($dbh, qq/ LOCK TABLES statistics WRITE, word READ, word_paragraph READ, paragraph READ, article READ, rating READ, author READ, album READ, band READ /); # delete everything in there... dbquery($dbh, qq/ DELETE FROM statistics /); my $word_counter; foreach my $wordref (@$words) { $word_counter++; if ($word_counter >= 100) { $word_counter = 0; dbquery($dbh, qq/UNLOCK TABLES/); dbquery($dbh, qq/ LOCK TABLES statistics WRITE, word READ, word_paragraph READ, paragraph READ, article READ, rating READ, author READ, album READ, band READ /); } my $word_id = $$wordref[0]; my $word = $$wordref[1]; my ($tot,$pos,$neg,$neu,$pwr,$nwr,$avg,$art,$aut,$score); my $avg_avg = "67.4"; # skip this word if it also exists in the band name or album name. #my $names = dbquery($dbh, qq/ # SELECT band.name,album.name # FROM word_paragraph,paragraph,article,album,band # WHERE paragraph_id = paragraph.id # AND article_id = article.id # AND album_id = album.id # AND band_id = band.id # AND word_id = $word_id #/); # total number of occurrences of this word $tot = dbquery($dbh, qq/ SELECT count(word_id) FROM word_paragraph WHERE word_id = $word_id /); # skip this word if it was used less than 10 times. next if $tot < 5; # total number of occurrences in positive articles $pos = dbquery($dbh, qq/ SELECT count(word_id) FROM word_paragraph, paragraph, article, rating WHERE word_id = $word_id AND word_paragraph.paragraph_id = paragraph.id AND paragraph.article_id = article.id AND article.rating_id = rating.id AND rating.id > $positive_rating /); # total number of occurrences in negative articles $neg = dbquery($dbh, qq/ SELECT count(word_id) FROM word_paragraph, paragraph, article, rating WHERE word_id = $word_id AND word_paragraph.paragraph_id = paragraph.id AND paragraph.article_id = article.id AND article.rating_id = rating.id AND rating.id < $negative_rating /); # total number of occurrences in articles neither positive nor negative $neu = dbquery($dbh, qq/ SELECT count(word_id) FROM word_paragraph, paragraph, article, rating WHERE word_id = $word_id AND word_paragraph.paragraph_id = paragraph.id AND paragraph.article_id = article.id AND article.rating_id = rating.id AND rating.id BETWEEN $negative_rating AND $positive_rating /); # positive word ratio if ($tot) { $pwr = sprintf("%.2f", $pos / $tot); } else { $pwr = "NULL"; } # negative word ratio if ($tot) { $nwr = sprintf("%.2f", $neg / $tot); } else { $nwr = "NULL"; } # average rating of articles containing this word $avg = dbquery($dbh, qq/ SELECT avg(rating.id) FROM word_paragraph, paragraph, article, rating WHERE word_id = $word_id AND word_paragraph.paragraph_id = paragraph.id AND paragraph.article_id = article.id AND article.rating_id = rating.id /); $avg = sprintf("%.2f", $avg); # distinct number of articles containing this word $art = dbquery($dbh, qq/ SELECT count(distinct article.id) FROM word_paragraph, paragraph, article WHERE word_id = $word_id AND word_paragraph.paragraph_id = paragraph.id AND paragraph.article_id = article.id /); # number of authors who used this word $aut = dbquery($dbh, qq/ SELECT count(distinct author.id) FROM word_paragraph, paragraph, article, author WHERE word_id = $word_id AND word_paragraph.paragraph_id = paragraph.id AND paragraph.article_id = article.id AND article.author_id = author.id /); # wordscore $score = ($avg - $avg_avg) * $tot; $score = sprintf("%.3f", $score); # stick these values into the statistics table... #dbquery($dbh, qq/ # DELETE FROM statistics WHERE word_id = $word_id #/); dbquery($dbh, qq/ INSERT INTO statistics (word_id, tot, pos, neg, neu, pwr, nwr, avg, art, aut, wordscore) VALUES ($word_id, $tot, $pos, $neg, $neu, $pwr, $nwr, $avg, $art, $aut, $score) /); } dbquery($dbh, qq/UNLOCK TABLES/);