/*
 author:   Christopher O'Neill
 date:     Feb 2001
 comments: Stemmer Evaluation Program. Also forms a suite of individual
           evaluation programs which may be run separately from the Main()
           program as methods of the object.
*/
import java.io.*;
import java.lang.*;
import java.util.*;

/************************************************************
 *Class:   Evaluation
 *
 *Purpose: Stemming Evaluation.
 *         Uses the modified Hamming distance defined in Fox and
 *         Frakes, "Strength and similarity of affix removal
 *         stemming algorithms". Includes an option for using the
 *         modified algorithm adapted by Chris O'Neill (2001) to
 *         support prefix removal alongside suffix removal.
 *         Also includes a similarity metric, using the Hamming
 *         distance measure, designed by Chris O'Neill, which
 *         gives a percentage value for similarity of output
 *         files.
 *************************************************************/
public class Evaluation {

    /****************************************************************
     *Method:   Clean
     *Returns:  String - the cleaned word
     *Receives: String word
     *Purpose:  builds a new string from those characters of word
     *          for which Character.isLetterOrDigit is true (every
     *          other character is dropped), converts the result
     *          to lower case and returns it
     ****************************************************************/
    private String Clean( String word ) {
        String clean = "";
        for ( int i=0; i < word.length(); i++ ) {
            /* keep letters and digits only; anything else is skipped */
            if (Character.isLetterOrDigit(word.charAt(i))) {
                clean += word.charAt(i);
            }
        }
        /* NOTE(review): toLowerCase() with no argument uses the default
           locale. Everything after the line comment below, up to the end
           of this physical line, is left exactly as found in this copy of
           the file. */
        clean = clean.toLowerCase(); //change all letters in the input to lowercase return clean; } // end of clean method /************************************************************ *Method: MeanConflation * *Returns: float average * *Receives: String sourceFile, String stemmedFile * *Purpose: mean no of words per conflation class * ************************************************************/ public float MeanConflation (String sourceFile, String stemmedFile) { float average=0; Vector stems = new Vector(); boolean newstem; String text; String word; StringTokenizer line = new StringTokenizer(""); try { FileReader fr = new 
FileReader(sourceFile); BufferedReader br = new BufferedReader(fr); while ((text=br.readLine())!= null) { line= new StringTokenizer(text); while (line.hasMoreTokens()) { word = line.nextToken(); average+=1; } } br.close(); fr.close(); } catch (Exception e) { System.err.println("Error Reading From Source File " + sourceFile + " error " + e); System.exit(0); } try { FileReader fr = new FileReader(stemmedFile); BufferedReader br = new BufferedReader(fr); while ((text=br.readLine())!= null) { line= new StringTokenizer(text); while (line.hasMoreTokens()) { word = new String(); word = line.nextToken(); newstem=true; for (int i=0; i max) { max = ((Integer) remCount.elementAt(i)).intValue(); Integer x = new Integer(i); modeCharRem.removeAllElements(); modeCharRem.addElement(x); } } return modeCharRem; } /**************************************************************** *Method: medianCharRem * *Returns: float medianCharRem * *Recievs: String sourceFile, String stemmedFile * *Purpose: median number of chars removed per term * ****************************************************************/ public float MedianCharRem (String sourceFile, String stemmedFile) { int charRem=0; float noWords=0; Vector remCount = new Vector(); String text1; String word1; StringTokenizer line1 = new StringTokenizer(""); String text2; String word2; StringTokenizer line2 = new StringTokenizer(""); try { FileReader fr1 = new FileReader(sourceFile); BufferedReader br1 = new BufferedReader(fr1); FileReader fr2 = new FileReader(stemmedFile); BufferedReader br2 = new BufferedReader(fr2); while (((text1=br1.readLine())!= null)&((text2=br2.readLine())!= null)) { line1= new StringTokenizer(text1); line2= new StringTokenizer(text2); while (line1.hasMoreTokens()) { word1 = Clean(line1.nextToken()); word2 = Clean(line2.nextToken()); charRem = CharRem(word1,word2); noWords+=1.0; while (remCount.size() <= charRem) { Integer x = new Integer(0); remCount.addElement(x); } Integer x = new Integer(((Integer) 
remCount.elementAt(charRem)).intValue() +1); remCount.setElementAt(x,charRem); } } } catch (Exception e) { System.err.println("File Error Durring reading " + e); } noWords=noWords/2; int max=0; charRem=0; float medianCharRem =0; for (int i=0; i0) { noWords = noWords - (((Integer) remCount.elementAt(i)).intValue()); medianCharRem=i; if (noWords==0.5) { noWords=0; medianCharRem+=0.5; } } } return medianCharRem; }

    /****************************************************************
     *Method:   TableCharRem
     *Returns:  Vector tableCharRem - a Vector of Integer in which
     *          element i is the number of word pairs for which
     *          CharRem returned i
     *Receives: String sourceFile, String stemmedFile
     *Purpose:  table (frequency count) of the number of chars
     *          removed per term. The two files are read line by
     *          line in step; each token of a source line is
     *          paired with the next token of the matching stemmed
     *          line, both are passed through Clean, and the value
     *          of CharRem for the pair selects the table slot to
     *          increment.
     *Errors:   any Exception raised while opening or reading is
     *          caught, reported on System.err, and whatever has
     *          been counted so far is returned.
     *NOTE(review): the readers opened here are never closed in
     *          this method.
     ****************************************************************/
    public Vector TableCharRem (String sourceFile, String stemmedFile) {
        int charRem=0;
        Vector tableCharRem = new Vector();
        String text1;
        String word1;
        StringTokenizer line1 = new StringTokenizer("");
        String text2;
        String word2;
        StringTokenizer line2 = new StringTokenizer("");
        try {
            FileReader fr1 = new FileReader(sourceFile);
            BufferedReader br1 = new BufferedReader(fr1);
            FileReader fr2 = new FileReader(stemmedFile);
            BufferedReader br2 = new BufferedReader(fr2);
            /* '&' is the non-short-circuit form, so a line is read from
               both files on every pass; the loop stops as soon as either
               file has no more lines */
            while (((text1=br1.readLine())!= null)&((text2=br2.readLine())!= null)) {
                line1= new StringTokenizer(text1);
                line2= new StringTokenizer(text2);
                /* only the source line is tested for remaining tokens:
                   if the stemmed line has fewer tokens, nextToken() throws
                   NoSuchElementException, which ends up in the catch below;
                   extra tokens on the stemmed line are ignored */
                while (line1.hasMoreTokens()) {
                    word1 = Clean(line1.nextToken());
                    word2 = Clean(line2.nextToken());
                    charRem = CharRem(word1,word2);
                    /* grow the table with zero counts until index
                       charRem exists */
                    while (tableCharRem.size() <= charRem) {
                        Integer x = new Integer(0);
                        tableCharRem.addElement(x);
                    }
                    /* increment the count stored at index charRem */
                    int x = ((Integer)tableCharRem.elementAt(charRem)).intValue();
                    x++;
                    Integer y= new Integer(x);
                    tableCharRem.setElementAt(y,charRem);
                }
            }
        } catch (Exception e) {
            /* reading stops here; the partially filled table is still
               returned below */
            System.err.println("File Error Durring reading " + e);
        }
        return tableCharRem;
    }

/**************************************************************** *Method: CharRem * *Returns: int charRem * *Recievs: String sourceTerm, String stemmedTerm * *Purpose: number of chars removed by stemming * 
****************************************************************/ public int CharRem (String sourceT, String stemmedT) { String sourceTerm =Clean(sourceT); String stemmedTerm =Clean(stemmedT); if (sourceTerm.length() max) { max = ((Integer) distCount.elementAt(i)).intValue(); Integer x = new Integer(i); modeHammDist.removeAllElements(); modeHammDist.addElement(x); } } return modeHammDist; } /**************************************************************** *Method: medianHammingDist * *Returns: float medianHammDist * *Recievs: String sourceFile, String stemmedFile, * * boolean pre * *Purpose: median modified Hamming distance per term use * * pre==true if prefix stripping is used * ****************************************************************/ public float MedianHammingDist (String sourceFile, String stemmedFile, boolean pre) { int hammDist=0; float noWords=0; Vector distCount = new Vector(); String text1; String word1; StringTokenizer line1 = new StringTokenizer(""); String text2; String word2; StringTokenizer line2 = new StringTokenizer(""); try { FileReader fr1 = new FileReader(sourceFile); BufferedReader br1 = new BufferedReader(fr1); FileReader fr2 = new FileReader(stemmedFile); BufferedReader br2 = new BufferedReader(fr2); while (((text1=br1.readLine())!= null)&((text2=br2.readLine())!= null)) { line1= new StringTokenizer(text1); line2= new StringTokenizer(text2); while (line1.hasMoreTokens()) { word1 = Clean(line1.nextToken()); word2 = Clean(line2.nextToken()); if (pre) { hammDist = PreHammDist(word1,word2); } else { hammDist = HammDist(word1,word2); } noWords+=1.0; while (distCount.size() <= hammDist) { Integer x = new Integer(0); distCount.addElement(x); } Integer x = new Integer(((Integer) distCount.elementAt(hammDist)).intValue() +1); distCount.setElementAt(x,hammDist); } } } catch (Exception e) { System.err.println("File Error Durring reading " + e); } noWords=noWords/2; int max=0; hammDist=0; float medianHammDist =0; for (int i=0; i0) { noWords = 
noWords - ((Integer) distCount.elementAt(i)).intValue(); medianHammDist=i; if (noWords==0.5) { noWords=0; medianHammDist+=0.5; } } } return medianHammDist; }

    /****************************************************************
     *Method:   TableHammingDist
     *Returns:  Vector tableHammDist - a Vector of Integer in which
     *          element i is the number of word pairs whose
     *          computed distance was i
     *Receives: String sourceFile, String stemmedFile,
     *          boolean pre
     *Purpose:  table (frequency count) of modified Hamming
     *          distance per term; use pre==true if prefix
     *          stripping is used. The two files are read line by
     *          line in step; each token of a source line is
     *          paired with the next token of the matching stemmed
     *          line, both are passed through Clean, and the
     *          distance is taken from PreHammDist when pre is
     *          true, otherwise from HammDist.
     *Errors:   any Exception raised while opening or reading is
     *          caught, reported on System.err, and whatever has
     *          been counted so far is returned.
     *NOTE(review): the readers opened here are never closed in
     *          this method.
     ****************************************************************/
    public Vector TableHammingDist (String sourceFile, String stemmedFile, boolean pre) {
        int hammDist=0;
        Vector tableHammDist = new Vector();
        String text1;
        String word1;
        StringTokenizer line1 = new StringTokenizer("");
        String text2;
        String word2;
        StringTokenizer line2 = new StringTokenizer("");
        try {
            FileReader fr1 = new FileReader(sourceFile);
            BufferedReader br1 = new BufferedReader(fr1);
            FileReader fr2 = new FileReader(stemmedFile);
            BufferedReader br2 = new BufferedReader(fr2);
            /* '&' is the non-short-circuit form, so a line is read from
               both files on every pass; the loop stops as soon as either
               file has no more lines */
            while (((text1=br1.readLine())!= null)&((text2=br2.readLine())!= null)) {
                line1= new StringTokenizer(text1);
                line2= new StringTokenizer(text2);
                /* only the source line is tested for remaining tokens:
                   if the stemmed line has fewer tokens, nextToken() throws
                   NoSuchElementException, which ends up in the catch below;
                   extra tokens on the stemmed line are ignored */
                while (line1.hasMoreTokens()) {
                    word1 = Clean(line1.nextToken());
                    word2 = Clean(line2.nextToken());
                    /* choose the distance routine according to pre */
                    if (pre) {
                        hammDist = PreHammDist(word1,word2);
                    } else {
                        hammDist = HammDist(word1,word2);
                    }
                    /* grow the table with zero counts until index
                       hammDist exists */
                    while (tableHammDist.size() <= hammDist) {
                        Integer x =new Integer(0);
                        tableHammDist.addElement(x);
                    }
                    /* increment the count stored at index hammDist */
                    Integer x = new Integer(((Integer) tableHammDist.elementAt(hammDist)).intValue() +1);
                    tableHammDist.setElementAt(x,hammDist);
                }
            }
        } catch (Exception e) {
            /* reading stops here; the partially filled table is still
               returned below */
            System.err.println("File Error Durring reading " + e);
        }
        return tableHammDist;
    }

/**************************************************************** *Method: PreHammDist * *Returns: int hammDist * *Recievs: String sourceT, String stemmedT * *Purpose: modified Hamming distance per term * ****************************************************************/ public int PreHammDist (String sourceT, String 
stemmedT) { String sourceTerm = Clean(sourceT); String stemmedTerm = Clean(stemmedT); String padding =""; int hammDist=sourceTerm.length(); int temp; if (sourceTerm.length()