#!/usr/bin/perl

###############################################################################
#
#                        The MWE 2008 Shared Task
#
# A simple "QuickStart" script for ranking MWE candidates extracted from the
# Prague Dependency Treebank 2.0. The candidates are provided with frequency
# information in the form of contingency tables. The script produces a list
# of MWE candidates ranked by decreasing Pointwise Mutual Information
# -- a baseline association measure.
# 
# Example usage: ./rank.pl pdt-mwe-frequency-all.dat > rank.list
#
# Pavel Pecina <pecina@ufal.mff.cuni.cz>, Feb 18, 2008
#
###############################################################################

use strict;
use warnings;

# Exactly one argument is expected: the path of the contingency-table file.
if (@ARGV != 1) {
  die "Usage: rank.pl pdt-mwe-frequency-all.dat\n";
}

my $file = $ARGV[0];

# Test for emptiness rather than truth, so that a file literally named "0"
# is still accepted.
if (!length $file) {
  die "Please specify the input file!\n";
}

# Open the input file. The three-argument form keeps characters such as
# '>' or '|' in the file name from being interpreted as an open mode.
open(my $in, '<', $file)
  or die "Error opening input file: $file: $!\n";

# A count must be a plain non-negative decimal number.
my $count_re = qr/\A\d+(?:\.\d+)?\z/;

# Score given to candidates with A = 0, for which the logarithm is undefined;
# negative infinity makes them sort after every candidate with a finite score.
my $NEG_INF = -9**9**9;

# Maps the candidate identifier to its association score.
my %res;

# Go through the input file containing values from the contingency table of
# each MWE candidate; compute the association measure score and store it.
# Each line holds ten whitespace-separated fields:
#   lemma1 tag1 rel1 lemma2 tag2 rel2 A B C D
# Fields beyond the tenth are ignored. If the same candidate appears more
# than once, the last occurrence wins.

while (my $line = <$in>) {

  my @fields = split ' ', $line;

  # silently skip blank lines
  next unless @fields;

  # a line with missing fields cannot be scored; report it and go on
  if (@fields < 10) {
    warn "$file:$.: expected 10 fields, found " . scalar(@fields)
       . "; line skipped\n";
    next;
  }

  # parse the input line and get the contingency table values
  my ($lem1, $tag1, $rel1, $lem2, $tag2, $rel2, $A, $B, $C, $D) = @fields;

  if (grep { $_ !~ $count_re } $A, $B, $C, $D) {
    warn "$file:$.: contingency table values must be non-negative numbers; "
       . "line skipped\n";
    next;
  }

  # candidate identifier
  my $key = join("\t", $lem1, $tag1, $rel1, $lem2, $tag2, $rel2);

  # compute and store the Pointwise Mutual Information score:
  #   log( A*N / ((A+B)*(A+C)) ),  where N = A+B+C+D.
  # With non-negative counts, A > 0 guarantees a non-zero denominator.
  $res{$key} = $A > 0
             ? log( ($A*($A+$B+$C+$D))/(($A+$B)*($A+$C)) )
             : $NEG_INF;
}

# Everything has been read; release the handle before producing output.
close($in);

# Sort the list of candidates according to the Pointwise Mutual Information
# scores (highest first) and print it to standard output. Candidates with
# equal scores are ordered by identifier so that the output is reproducible
# from run to run instead of depending on hash ordering.

foreach my $key ( sort { $res{$b} <=> $res{$a} or $a cmp $b } keys %res ) {

  print "$key\n";
}
