/*
    author:   Chris O'Neill
    date:     Feb 2001
    comments: Porter Stemming Algorithm
*/

import java.io.*;
import java.lang.*;
import java.util.*;

/**
 * Porter-style stemmer whose prefix list and step 2/3/4 suffix tables are
 * loaded from a rules file instead of being hard-coded.
 *
 * <p>Rules file format (as consumed by {@link #readRules()}):
 * <ul>
 *   <li>A line whose first token starts with <code>#</code> is a section
 *       header. The recognised headers are <code>#prefixes</code>,
 *       <code>#suffixes2</code>, <code>#suffixes3</code> and
 *       <code>#suffixes4</code>; any other <code>#</code> line ends the
 *       current section.</li>
 *   <li>Every other line inside a section holds two whitespace separated
 *       tokens: the affix and its replacement. The token <code>()</code>
 *       stands for the empty string.</li>
 * </ul>
 *
 * <p>An instance keeps the stem found by the most recent successful suffix
 * match in a field ({@code stemValue}), so one instance must not be used by
 * several threads at the same time.
 */
public class Porter {

    /** Stem left over by the last successful {@link #hasSuffix} call. */
    private String stemValue;

    /** True when prefixes are stripped before suffix removal. */
    private boolean preStrip;

    /** Path of the rules file read by the constructor. */
    private final String rulesFile;

    /*
     * Rule tables. Each one is a flat list of (affix, replacement) pairs:
     * even indexes hold the affix, the following odd index its replacement.
     */
    private final Vector prefixes;
    private final Vector suffixes2;
    private final Vector suffixes3;
    private final Vector suffixes4;

    /**
     * Creates a stemmer and immediately loads its rules.
     *
     * <p>If the rules file cannot be opened or read, an error is printed to
     * standard error and the JVM exits with status 1.
     *
     * @param rules path of the rules file
     * @param pre   the literal string "/p" enables prefix stripping; any
     *              other value (including null) disables it
     */
    public Porter(String rules, String pre) {
        prefixes = new Vector();
        suffixes2 = new Vector();
        suffixes3 = new Vector();
        suffixes4 = new Vector();
        rulesFile = rules;
        // Constant-first comparison so a null flag simply means "no prefix stripping".
        preStrip = "/p".equals(pre);
        readRules();
    }

    /*************** Private Methods ***********/

    /**
     * Loads the prefix and suffix tables from {@code rulesFile}.
     *
     * <p>Sections may appear in any order. Blank lines and lines with fewer
     * than two tokens are skipped; tokens after the second one on a line are
     * ignored. Exits the JVM with status 1 when the file cannot be opened or
     * read.
     */
    private void readRules() {
        BufferedReader br;
        try {
            br = new BufferedReader(new FileReader(rulesFile));
        } catch (IOException e) {
            System.err.println("File Not Found " + rulesFile + " exception " + e);
            System.exit(1);
            return;
        }

        boolean failed = false;
        try {
            Vector section = null; // table currently being filled, null outside any section
            String text;
            while ((text = br.readLine()) != null) {
                StringTokenizer line = new StringTokenizer(text);
                if (!line.hasMoreTokens()) {
                    continue; // blank line
                }
                String first = line.nextToken();
                if (first.charAt(0) == '#') {
                    section = sectionFor(first);
                    continue;
                }
                if (section == null || !line.hasMoreTokens()) {
                    continue; // outside a section, or no replacement token on the line
                }
                section.addElement(ruleToken(first));
                section.addElement(ruleToken(line.nextToken()));
            }
        } catch (IOException e) {
            System.err.println("File Error Durring Reading " + e);
            failed = true;
        } finally {
            // The file is not needed again, so a failed close is only reported.
            try {
                br.close();
            } catch (IOException e) {
                System.err.println("Error Closing File During Reading " + e);
            }
        }
        if (failed) {
            System.exit(1);
        }
    }

    /** Returns the rule table named by a section header, or null for an unknown header. */
    private Vector sectionFor(String header) {
        if (header.equals("#prefixes")) {
            return prefixes;
        }
        if (header.equals("#suffixes2")) {
            return suffixes2;
        }
        if (header.equals("#suffixes3")) {
            return suffixes3;
        }
        if (header.equals("#suffixes4")) {
            return suffixes4;
        }
        return null;
    }

    /** Maps the rules-file placeholder "()" to the empty string; other tokens pass through. */
    private String ruleToken(String token) {
        return token.equals("()") ? "" : token;
    }

    /*************** Util Methods **************/

    /**
     * Removes every character that is not a letter or a digit.
     *
     * @param word text to filter
     * @return {@code word} with only its letter and digit characters, in order
     */
    private String Clean(String word) {
        StringBuffer clean = new StringBuffer(word.length());
        for (int i = 0; i < word.length(); i++) {
            char ch = word.charAt(i);
            if (Character.isLetterOrDigit(ch)) {
                clean.append(ch);
            }
        }
        return clean.toString();
    }

    /**
     * Tests whether {@code word} ends with {@code suffix} and is strictly
     * longer than it.
     *
     * <p>Side effect: on success {@code stemValue} is set to {@code word}
     * without the suffix. On failure {@code stemValue} is left untouched;
     * callers only read it after a successful match.
     *
     * @param word   word to test
     * @param suffix candidate suffix (the empty string matches any non-empty word)
     * @return true when the suffix matches and a non-empty stem remains
     */
    private boolean hasSuffix(String word, String suffix) {
        if (word.length() <= suffix.length() || !word.endsWith(suffix)) {
            return false;
        }
        stemValue = word.substring(0, word.length() - suffix.length());
        return true;
    }

    /**
     * Decides whether {@code ch} is a vowel.
     *
     * @param ch   character to classify
     * @param prev character preceding {@code ch}; only consulted when
     *             {@code ch} is 'y', which counts as a vowel unless
     *             {@code prev} is one of a, e, i, o, u
     * @return true when {@code ch} is treated as a vowel
     */
    private boolean vowel(char ch, char prev) {
        switch (ch) {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
                return true;
            case 'y':
                switch (prev) {
                    case 'a':
                    case 'e':
                    case 'i':
                    case 'o':
                    case 'u':
                        return false;
                    default:
                        return true;
                }
            default:
                return false;
        }
    }

    /**
     * Counts the positions at which a vowel is immediately followed by a
     * non-vowel.
     *
     * @param stem text to measure
     * @return number of vowel-to-consonant transitions
     */
    private int measure(String stem) {
        int count = 0;
        char prev = 'a'; // seed used for the first character, so a leading 'y' is a consonant
        for (int i = 0; i < stem.length() - 1; i++) {
            char current = stem.charAt(i);
            if (vowel(current, prev) && !vowel(stem.charAt(i + 1), current)) {
                count++;
            }
            prev = current;
        }
        return count;
    }

    /**
     * Reports whether {@code word} contains at least one vowel.
     *
     * @param word text to inspect
     * @return true when some character is classified as a vowel
     */
    private boolean containsVowel(String word) {
        char prev = 'a'; // same seed as measure()
        for (int i = 0; i < word.length(); i++) {
            char current = word.charAt(i);
            if (vowel(current, prev)) {
                return true;
            }
            prev = current;
        }
        return false;
    }

    /**
     * Tests whether the last three characters are consonant, vowel,
     * consonant, with the final consonant not being w, x or y.
     *
     * @param str text to inspect
     * @return true when the ending matches; always false for fewer than three characters
     */
    private boolean cvc(String str) {
        int length = str.length();
        if (length < 3) {
            return false;
        }
        char last = str.charAt(length - 1);
        char middle = str.charAt(length - 2);
        char first = str.charAt(length - 3);

        if (vowel(last, middle) || last == 'w' || last == 'x' || last == 'y') {
            return false;
        }
        if (!vowel(middle, first)) {
            return false;
        }
        // With exactly three characters there is no predecessor; '?' is the
        // seed, which makes a leading 'y' count as a vowel here.
        char beforeFirst = (length == 3) ? '?' : str.charAt(length - 4);
        return !vowel(first, beforeFirst);
    }

    /**************** Remove Suffixes ******************/

    /**
     * Runs steps 1 to 5 in order, stopping as soon as a step leaves an empty string.
     *
     * @param str non-empty word
     * @return the stem
     */
    private String stripSuffixes(String str) {
        str = step1(str);
        if (str.length() < 1) {
            return str;
        }
        str = step2(str);
        if (str.length() < 1) {
            return str;
        }
        str = step3(str);
        if (str.length() < 1) {
            return str;
        }
        str = step4(str);
        if (str.length() < 1) {
            return str;
        }
        return step5(str);
    }

    /**
     * Step 1: plurals (1a), "eed"/"ed"/"ing" endings (1b) and terminal "y" (1c).
     *
     * @param str non-empty word
     * @return the word after step 1
     */
    private String step1(String str) {
        stemValue = "";

        /* Step 1a */
        if (str.charAt(str.length() - 1) == 's') {
            if (hasSuffix(str, "sses") || hasSuffix(str, "ies")) {
                str = str.substring(0, str.length() - 2);
            } else {
                if (str.length() == 1) {
                    return ""; // the word was just "s"
                }
                if (str.charAt(str.length() - 2) != 's') {
                    str = str.substring(0, str.length() - 1);
                }
            }
        }

        /* Step 1b */
        if (hasSuffix(str, "eed")) {
            if (measure(stemValue) > 0) {
                str = str.substring(0, str.length() - 1);
            }
        } else if (hasSuffix(str, "ed") || hasSuffix(str, "ing")) {
            if (containsVowel(stemValue)) {
                str = str.substring(0, stemValue.length());
                if (str.length() == 1) {
                    return str;
                }
                if (hasSuffix(str, "at") || hasSuffix(str, "bl") || hasSuffix(str, "iz")) {
                    str += "e";
                } else {
                    int length = str.length();
                    char last = str.charAt(length - 1);
                    // Doubled final letter other than l, s, z: drop one copy.
                    if (last == str.charAt(length - 2)
                            && last != 'l' && last != 's' && last != 'z') {
                        str = str.substring(0, length - 1);
                    } else if (measure(str) == 1 && cvc(str)) {
                        str += "e";
                    }
                }
            }
        }

        /* Step 1c */
        if (hasSuffix(str, "y") && containsVowel(stemValue)) {
            str = str.substring(0, str.length() - 1) + "i";
        }
        return str;
    }

    /**
     * Applies the first rule of a pair table whose suffix matches and whose
     * remaining stem has a measure greater than {@code minMeasure}.
     *
     * @param str               word to rewrite
     * @param rules             flat (suffix, replacement) pair table
     * @param minMeasure        the stem's measure must exceed this value
     * @param appendReplacement true to append the rule's replacement to the
     *                          stem, false to return the bare stem
     * @return the rewritten word, or {@code str} unchanged when no rule applies
     */
    private String applyRules(String str, Vector rules, int minMeasure, boolean appendReplacement) {
        stemValue = "";
        for (int index = 0; index < rules.size(); index += 2) {
            if (hasSuffix(str, (String) rules.elementAt(index))
                    && measure(stemValue) > minMeasure) {
                if (appendReplacement) {
                    return stemValue + (String) rules.elementAt(index + 1);
                }
                return stemValue;
            }
        }
        return str;
    }

    /**
     * Step 2: replaces a suffix from the "#suffixes2" table when the stem's measure is above 0.
     *
     * @param str word to process
     * @return the word after step 2
     */
    private String step2(String str) {
        return applyRules(str, suffixes2, 0, true);
    }

    /**
     * Step 3: replaces a suffix from the "#suffixes3" table when the stem's measure is above 0.
     *
     * <p>NOTE(review): the loop header of this method was damaged in the
     * source this was restored from; it is rebuilt to mirror step 2.
     *
     * @param str word to process
     * @return the word after step 3
     */
    private String step3(String str) {
        return applyRules(str, suffixes3, 0, true);
    }

    /**
     * Step 4: removes a suffix from the "#suffixes4" table when the stem's measure is above 1.
     *
     * <p>NOTE(review): the loop header of this method was damaged in the
     * source this was restored from. The table is walked in steps of two
     * because readRules stores (suffix, replacement) pairs; the replacement
     * is not used by this step.
     *
     * @param str word to process
     * @return the word after step 4
     */
    private String step4(String str) {
        return applyRules(str, suffixes4, 1, false);
    }

    /**
     * Step 5: drops a final "e" (5a) and reduces a final "ll" to "l" (5b).
     *
     * @param str non-empty word
     * @return the word after step 5
     */
    private String step5(String str) {
        /* Step 5a -- the measure is taken over the word including its final 'e'. */
        if (str.charAt(str.length() - 1) == 'e') {
            int m = measure(str);
            if (m > 1) {
                str = str.substring(0, str.length() - 1);
            } else if (m == 1) {
                String stem = str.substring(0, str.length() - 1);
                if (!cvc(stem)) {
                    str = stem;
                }
            }
        }

        /* Step 5b */
        if (str.length() == 1) {
            return str;
        }
        if (str.charAt(str.length() - 1) == 'l'
                && str.charAt(str.length() - 2) == 'l'
                && measure(str) > 1) {
            str = str.substring(0, str.length() - 1);
        }
        return str;
    }

    /**
     * Removes the first matching prefix from the "#prefixes" table, provided
     * the word is longer than that prefix.
     *
     * <p>NOTE(review): the loop header and the start of the condition were
     * damaged in the source this was restored from. The table is walked in
     * steps of two because readRules stores (prefix, replacement) pairs; the
     * replacement is not used here.
     *
     * @param str word to process
     * @return the word without its prefix, or unchanged when none matches
     */
    private String stripPrefixes(String str) {
        for (int i = 0; i < prefixes.size(); i += 2) {
            String prefix = (String) prefixes.elementAt(i);
            if (str.startsWith(prefix) && str.length() > prefix.length()) {
                return str.substring(prefix.length());
            }
        }
        return str;
    }

    /*************************** PUBLIC METHODS ****************/

    /**
     * Stems a single word.
     *
     * <p>The input is lower-cased and stripped of everything that is not a
     * letter or digit. Results of one or two characters are returned as is;
     * longer ones go through optional prefix stripping and then suffix
     * stripping.
     *
     * @param str word to stem; must not be null
     * @return the stem
     */
    public String stripAffixes(String str) {
        // Fixed locale: the vowel tests compare against plain ASCII letters.
        str = Clean(str.toLowerCase(Locale.ENGLISH));

        if (str.length() > 2) {
            if (preStrip) {
                str = stripPrefixes(str);
            }
            if (str.length() > 0) {
                str = stripSuffixes(str);
            }
        }
        return str;
    }

    /*************** MAIN METHOD ***************/

    /**
     * Stems every whitespace separated word of an input file and writes the
     * stems, each followed by a space, line by line to an output file.
     *
     * @param args args[0] input file, args[1] output file, args[2] rules
     *             file, args[3] (optional) "/p" to enable prefix stripping
     */
    public static void main(String args[]) {
        if (args.length < 3) {
            System.err.println("Usage: java Porter <input file> <output file> <rules file> [/p]");
            System.exit(1);
            return;
        }
        String filein = args[0];
        String fileout = args[1];
        Porter p = new Porter(args[2], args.length > 3 ? args[3] : "");

        BufferedWriter bw = null;
        BufferedReader br = null;
        try {
            bw = new BufferedWriter(new FileWriter(fileout));
            br = new BufferedReader(new FileReader(filein));
        } catch (Exception e) {
            System.err.println("Input File" + filein + "not found");
            if (bw != null) {
                try {
                    bw.close();
                } catch (Exception closeError) {
                    System.err.println(closeError);
                }
            }
            System.exit(1);
            return;
        }

        boolean failed = false;
        try {
            String text;
            while ((text = br.readLine()) != null) {
                StringTokenizer line = new StringTokenizer(text);
                // Best effort per line: a failure is reported and the next line is tried.
                try {
                    while (line.hasMoreTokens()) {
                        bw.write(p.stripAffixes(line.nextToken()) + " ");
                    }
                    bw.newLine();
                } catch (Exception e) {
                    System.err.println(e);
                }
            }
        } catch (Exception e) {
            System.err.println("File Error Durring Reading" + e);
            failed = true;
        }

        // The input file is not needed again, so a failed close is only reported.
        try {
            br.close();
        } catch (Exception e) {
            System.err.println("Error Closing File During Reading");
        }
        try {
            bw.close();
        } catch (Exception e) {
            System.err.println(e);
        }
        if (failed) {
            System.exit(1);
        }
    }
}