/* 	author: 	Chris O'Neill
	date:		Feb 2001
   	comments: 	Porter Stemming Algorithm
*/



import java.io.*;
import java.lang.*;
import java.util.*;






/****************************************************************
*Class:		Porter
*
*Purpose:	Porter stemming algorithm driven by a rules file.
*			Steps 1 and 5 are hard coded; the prefixes and the
*			suffix/replacement pairs for steps 2, 3 and 4 are
*			read from the rules file given to the constructor.
*
*Note:		hasSuffix() stores the stem it finds in the
*			stemValue field, which the step methods read back,
*			so a single instance must not be used by several
*			threads at the same time.
****************************************************************/
public class Porter
{
	// Section headers recognised in the rules file.
	private static final String PREFIXES_HEADER  = "#prefixes";
	private static final String SUFFIXES2_HEADER = "#suffixes2";
	private static final String SUFFIXES3_HEADER = "#suffixes3";
	private static final String SUFFIXES4_HEADER = "#suffixes4";

	// Token used in the rules file to denote the empty string.
	private static final String EMPTY_MARKER = "()";

	// Value of the second constructor argument that enables prefix stripping.
	private static final String PRE_STRIP_FLAG = "/p";

	// Stem (word minus suffix) recorded by the last successful hasSuffix() call.
	private String stemValue;

	private final boolean preStrip;
	private final String rulesFile;

	// Each Vector holds flat pairs of Strings: [affix, replacement, affix, replacement, ...].
	private final Vector prefixes;
	private final Vector suffixes2;
	private final Vector suffixes3;
	private final Vector suffixes4;


	/****************************************************************
	*Method:	Porter
	*Receives:	String rules	path of the rules file
	*			String pre		"/p" to strip prefixes before the
	*							suffixes; any other value (or null)
	*							disables prefix stripping
	*Purpose:	initialises the rule tables and loads the rules
	*			file. If the file cannot be opened or read an
	*			error is printed and the JVM exits with status 1.
	****************************************************************/
	public Porter(String rules, String pre)
	{
		prefixes = new Vector();
		suffixes2 = new Vector();
		suffixes3 = new Vector();
		suffixes4 = new Vector();
		rulesFile = rules;
		// Constant first so that a null flag simply means "no prefix stripping".
		preStrip = PRE_STRIP_FLAG.equals(pre);
		readRules();
	}


	/***************Private Methods***********/

	/****************************************************************
	*Method:	readRules
	*Returns:	void
	*Purpose:	loads the prefix and suffix tables from rulesFile.
	*			A line whose first token starts with '#' selects
	*			the table that following lines are added to
	*			(#prefixes, #suffixes2, #suffixes3, #suffixes4);
	*			any other '#' line closes the current section.
	*			Rule lines hold two whitespace separated tokens,
	*			"affix replacement", where "()" stands for the
	*			empty string. Blank lines, lines outside a known
	*			section, lines with a single token and any tokens
	*			after the second are ignored. Sections may appear
	*			in any order.
	****************************************************************/
	private void readRules()
	{
		BufferedReader br;
		try
		{
			br = new BufferedReader(new FileReader(rulesFile));
		}
		catch (Exception e)
		{
			System.err.println("File Not Found " + rulesFile + " exception " + e);
			System.exit(1);
			return;
		}

		try
		{
			Vector section = null; // table currently being filled, null outside a known section
			String text;
			while ((text = br.readLine()) != null)
			{
				StringTokenizer line = new StringTokenizer(text);
				if (!line.hasMoreTokens())
				{
					continue; // blank line
				}
				String first = line.nextToken();
				if (first.charAt(0) == '#')
				{
					section = sectionFor(first);
					continue;
				}
				if (section == null || !line.hasMoreTokens())
				{
					continue; // outside any section, or no replacement given
				}
				section.addElement(ruleText(first));
				section.addElement(ruleText(line.nextToken()));
			}
		}
		catch (IOException e)
		{
			System.err.println("File Error Durring Reading " + e);
			System.exit(1); // a failed read is an error, so do not report success
		}
		finally
		{
			// file is not needed again so if it can't be closed don't exit
			try
			{
				br.close();
			}
			catch (IOException e)
			{
				System.err.println("Error Closing File During Reading " + e);
			}
		}
	} // end of readRules method

	/****************************************************************
	*Method:	sectionFor
	*Returns:	Vector	the table selected by header, or null
	*					when header is not a known section name
	*Receives:	String header	first token of a '#' line
	****************************************************************/
	private Vector sectionFor(String header)
	{
		if (header.equals(PREFIXES_HEADER))
		{
			return prefixes;
		}
		if (header.equals(SUFFIXES2_HEADER))
		{
			return suffixes2;
		}
		if (header.equals(SUFFIXES3_HEADER))
		{
			return suffixes3;
		}
		if (header.equals(SUFFIXES4_HEADER))
		{
			return suffixes4;
		}
		return null;
	}

	/****************************************************************
	*Method:	ruleText
	*Returns:	String	"" when token is "()", otherwise token
	*Receives:	String token	a token read from a rule line
	****************************************************************/
	private static String ruleText(String token)
	{
		return token.equals(EMPTY_MARKER) ? "" : token;
	}


	/***************Util Methods**************/

	/****************************************************************
	*Method:	clean
	*Returns:	String
	*Receives:	String word
	*Purpose:	returns word with every character that is not a
	*			letter or a digit removed
	****************************************************************/
	private String clean(String word)
	{
		StringBuffer kept = new StringBuffer(word.length());
		for (int i = 0; i < word.length(); i++)
		{
			char ch = word.charAt(i);
			if (Character.isLetterOrDigit(ch))
			{
				kept.append(ch);
			}
		}
		return kept.toString();
	} // end of clean method

	/****************************************************************
	*Method:	hasSuffix
	*Returns:	boolean
	*Receives:	String word, String suffix
	*Purpose:	returns true if word ends with suffix and is
	*			strictly longer than it. On success the part of
	*			word in front of the suffix is stored in
	*			stemValue; on failure stemValue is left untouched.
	****************************************************************/
	private boolean hasSuffix(String word, String suffix)
	{
		if (word.length() <= suffix.length() || !word.endsWith(suffix))
		{
			return false;
		}
		stemValue = word.substring(0, word.length() - suffix.length());
		return true;
	}

	/****************************************************************
	*Method:	isPlainVowel
	*Returns:	boolean	true when ch is one of a, e, i, o, u
	*Receives:	char ch
	****************************************************************/
	private static boolean isPlainVowel(char ch)
	{
		return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
	}

	/****************************************************************
	*Method:	vowel
	*Returns:	boolean
	*Receives:	char ch, char prev
	*Purpose:	determines whether ch is a vowel. a, e, i, o and
	*			u always are; y is a vowel unless prev is one of
	*			a, e, i, o, u.
	****************************************************************/
	private boolean vowel(char ch, char prev)
	{
		if (ch == 'y')
		{
			return !isPlainVowel(prev);
		}
		return isPlainVowel(ch);
	}

	/****************************************************************
	*Method:	vowelAt
	*Returns:	boolean
	*Receives:	String str, int index
	*Purpose:	vowel test for the character at index using the
	*			character before it as context. The first
	*			character is tested as if preceded by 'a', so a
	*			leading y counts as a consonant everywhere.
	****************************************************************/
	private boolean vowelAt(String str, int index)
	{
		char prev = (index == 0) ? 'a' : str.charAt(index - 1);
		return vowel(str.charAt(index), prev);
	}

	/****************************************************************
	*Method:	measure
	*Returns:	int
	*Receives:	String stem
	*Purpose:	counts the positions where a vowel is directly
	*			followed by a consonant
	****************************************************************/
	private int measure(String stem)
	{
		int count = 0;
		for (int i = 0; i < stem.length() - 1; i++)
		{
			if (vowelAt(stem, i) && !vowelAt(stem, i + 1))
			{
				count++;
			}
		}
		return count;
	}

	/****************************************************************
	*Method:	containsVowel
	*Returns:	boolean
	*Receives:	String word
	*Purpose:	returns true if word contains a vowel
	****************************************************************/
	private boolean containsVowel(String word)
	{
		for (int i = 0; i < word.length(); i++)
		{
			if (vowelAt(word, i))
			{
				return true;
			}
		}
		return false;
	}

	/****************************************************************
	*Method:	cvc
	*Returns:	boolean
	*Receives:	String str
	*Purpose:	returns true if the last three letters of str are
	*			consonant, vowel, consonant and the final
	*			consonant is not w, x or y
	****************************************************************/
	private boolean cvc(String str)
	{
		int last = str.length() - 1;
		if (last < 2)
		{
			return false;
		}
		char end = str.charAt(last);
		return !vowelAt(str, last)
			&& end != 'w' && end != 'x' && end != 'y'
			&& vowelAt(str, last - 1)
			&& !vowelAt(str, last - 2);
	}


	/**************** Remove Suffixes ******************/

	/****************************************************************
	*Method:	stripSuffixes
	*Returns:	String
	*Receives:	String str	non-empty word
	*Purpose:	runs the five suffix removal steps in order and
	*			returns the stem; once a step leaves an empty
	*			string the remaining steps are skipped
	****************************************************************/
	private String stripSuffixes(String str)
	{
		str = step1(str);
		if (str.length() >= 1)
		{
			str = step2(str);
		}
		if (str.length() >= 1)
		{
			str = step3(str);
		}
		if (str.length() >= 1)
		{
			str = step4(str);
		}
		if (str.length() >= 1)
		{
			str = step5(str);
		}
		return str;
	}

	/****************************************************************
	*Method:	step1
	*Returns:	String
	*Receives:	String str
	*Purpose:	step one of the algorithm: plurals (1a), -eed,
	*			-ed and -ing (1b) and terminal y (1c)
	****************************************************************/
	private String step1(String str)
	{
		stemValue = "";
		str = step1a(str);
		if (str.length() == 0)
		{
			return str;
		}
		return step1c(step1b(str));
	}

	/****************************************************************
	*Method:	step1a
	*Returns:	String
	*Receives:	String str
	*Purpose:	sses -> ss, ies -> i, a lone "s" -> "", and a
	*			final s is dropped unless preceded by another s
	****************************************************************/
	private String step1a(String str)
	{
		int length = str.length();
		if (length == 0 || str.charAt(length - 1) != 's')
		{
			return str;
		}
		if (hasSuffix(str, "sses") || hasSuffix(str, "ies"))
		{
			return str.substring(0, length - 2);
		}
		if (length == 1)
		{
			return "";
		}
		if (str.charAt(length - 2) != 's')
		{
			return str.substring(0, length - 1);
		}
		return str;
	}

	/****************************************************************
	*Method:	step1b
	*Returns:	String
	*Receives:	String str
	*Purpose:	eed -> ee when the stem has measure > 0;
	*			otherwise removes ed / ing when the stem contains
	*			a vowel and then tidies the stem: at, bl, iz get
	*			an e, a doubled consonant other than l, s, z is
	*			made single, and a cvc stem of measure 1 gets an e
	****************************************************************/
	private String step1b(String str)
	{
		if (hasSuffix(str, "eed"))
		{
			// A word ending in eed is never treated as an ed word.
			if (measure(stemValue) > 0)
			{
				return str.substring(0, str.length() - 1);
			}
			return str;
		}
		if (!(hasSuffix(str, "ed") || hasSuffix(str, "ing")) || !containsVowel(stemValue))
		{
			return str;
		}

		str = stemValue;
		if (str.length() == 1)
		{
			return str;
		}
		if (hasSuffix(str, "at") || hasSuffix(str, "bl") || hasSuffix(str, "iz"))
		{
			return str + "e";
		}

		int last = str.length() - 1;
		char end = str.charAt(last);
		// Only a doubled CONSONANT is shortened; "see" (from "seeing") must stay intact.
		if (end == str.charAt(last - 1) && !vowelAt(str, last)
			&& end != 'l' && end != 's' && end != 'z')
		{
			return str.substring(0, last);
		}
		if (measure(str) == 1 && cvc(str))
		{
			return str + "e";
		}
		return str;
	}

	/****************************************************************
	*Method:	step1c
	*Returns:	String
	*Receives:	String str
	*Purpose:	turns a final y into i when the rest of the word
	*			contains a vowel
	****************************************************************/
	private String step1c(String str)
	{
		if (hasSuffix(str, "y") && containsVowel(stemValue))
		{
			return stemValue + "i";
		}
		return str;
	}

	/****************************************************************
	*Method:	applyRules
	*Returns:	String
	*Receives:	String str			word to stem
	*			Vector rules		flat suffix/replacement pairs
	*			int minMeasure		the stem must have a measure
	*								greater than this
	*			boolean replace		append the rule's replacement
	*								(true) or just drop the suffix
	*Purpose:	applies the first rule, in file order, whose
	*			suffix matches str and whose stem has a large
	*			enough measure; returns str unchanged if none does
	****************************************************************/
	private String applyRules(String str, Vector rules, int minMeasure, boolean replace)
	{
		stemValue = "";
		for (int index = 0; index + 1 < rules.size(); index += 2)
		{
			if (hasSuffix(str, (String) rules.elementAt(index)) && measure(stemValue) > minMeasure)
			{
				return replace ? stemValue + (String) rules.elementAt(index + 1) : stemValue;
			}
		}
		return str;
	}

	/****************************************************************
	*Method:	step2
	*Returns:	String
	*Receives:	String str
	*Purpose:	step two: replaces a suffixes2 suffix when the
	*			stem has measure > 0
	****************************************************************/
	private String step2(String str)
	{
		return applyRules(str, suffixes2, 0, true);
	}

	/****************************************************************
	*Method:	step3
	*Returns:	String
	*Receives:	String str
	*Purpose:	step three: replaces a suffixes3 suffix when the
	*			stem has measure > 0
	****************************************************************/
	private String step3(String str)
	{
		return applyRules(str, suffixes3, 0, true);
	}

	/****************************************************************
	*Method:	step4
	*Returns:	String
	*Receives:	String str
	*Purpose:	step four: removes a suffixes4 suffix when the
	*			stem has measure > 1 (the replacement column of
	*			the rules is not used by this step)
	****************************************************************/
	private String step4(String str)
	{
		return applyRules(str, suffixes4, 1, false);
	}

	/****************************************************************
	*Method:	step5
	*Returns:	String
	*Receives:	String str	non-empty word
	*Purpose:	step five: drops a final e when the measure is
	*			> 1, or is 1 and the rest is not cvc (5a), then
	*			reduces a final ll to l when the measure is > 1
	*			(5b)
	****************************************************************/
	private String step5(String str)
	{
		/* Step5a */
		if (str.charAt(str.length() - 1) == 'e')
		{
			int m = measure(str);
			String stem = str.substring(0, str.length() - 1);
			if (m > 1 || (m == 1 && !cvc(stem)))
			{
				str = stem;
			}
		}

		/* Step5b */
		int length = str.length();
		if (length < 2)
		{
			return str;
		}
		if (str.charAt(length - 1) == 'l' && str.charAt(length - 2) == 'l' && measure(str) > 1)
		{
			str = str.substring(0, length - 1);
		}
		return str;
	}

	/****************************************************************
	*Method:	stripPrefixes
	*Returns:	String
	*Receives:	String str
	*Purpose:	removes the first listed prefix that str starts
	*			with, provided something is left afterwards (the
	*			replacement column of the rules is not used)
	****************************************************************/
	private String stripPrefixes(String str)
	{
		for (int i = 0; i < prefixes.size(); i += 2)
		{
			String prefix = (String) prefixes.elementAt(i);
			if (str.startsWith(prefix) && str.length() > prefix.length())
			{
				return str.substring(prefix.length());
			}
		}
		return str;
	}


	/*************************** PUBLIC METHODS ****************/

	/****************************************************************
	*Method:	stripAffixes
	*Returns:	String	the stem of str
	*Receives:	String str
	*Purpose:	lower-cases str, removes every character that is
	*			not a letter or digit and, when more than two
	*			characters remain, strips prefixes (only if
	*			enabled in the constructor) and then suffixes
	****************************************************************/
	public String stripAffixes(String str)
	{
		// Fixed locale: the default one can map 'I' to a dotless i (e.g. Turkish).
		str = clean(str.toLowerCase(Locale.ENGLISH));
		if (str.length() > 2)
		{
			if (preStrip)
			{
				str = stripPrefixes(str);
			}
			if (str.length() > 0)
			{
				str = stripSuffixes(str);
			}
		}
		return str;
	} //stripAffixes


	//***************MAIN METHOD***************
	/****************************************************************
	*Method:	main
	*Returns:	void
	*Receives:	String args[0] input file
	*			String args[1] output file
	*			String args[2] rules file
	*			String args[3] "/p" for prefix stripping (optional)
	*Purpose:	stems every whitespace separated word of the
	*			input file and writes the stems, each followed by
	*			a space, to the output file keeping one output
	*			line per input line. Exits with status 1 on a
	*			usage or I/O error.
	****************************************************************/
	public static void main(String args[])
	{
		if (args.length < 3)
		{
			System.err.println("Usage: java Porter <input file> <output file> <rules file> [/p]");
			System.exit(1);
			return;
		}
		Porter p = new Porter(args[2], (args.length > 3) ? args[3] : "");
		String filein = args[0];
		String fileout = args[1];

		// Open the input first so a missing input file does not truncate the output file.
		BufferedReader br;
		try
		{
			br = new BufferedReader(new FileReader(filein));
		}
		catch (IOException e)
		{
			System.err.println("Input File " + filein + " not found");
			System.exit(1);
			return;
		}

		BufferedWriter bw = null;
		int status = 0;
		try
		{
			bw = new BufferedWriter(new FileWriter(fileout));
			String text;
			while ((text = br.readLine()) != null)
			{
				StringTokenizer line = new StringTokenizer(text);
				try
				{
					while (line.hasMoreTokens())
					{
						// read word from line and stem word
						bw.write(p.stripAffixes(line.nextToken()) + " ");
					}
					bw.newLine();
				}
				catch (RuntimeException e)
				{
					// a word that cannot be stemmed must not stop the rest of the file
					System.err.println(e);
				}
			}
		}
		catch (IOException e)
		{
			System.err.println("File Error while stemming " + filein + " to " + fileout + ": " + e);
			status = 1;
		}
		finally
		{
			// input is not needed again so if it can't be closed don't fail
			try
			{
				br.close();
			}
			catch (IOException e)
			{
				System.err.println("Error Closing File During Reading");
			}
			if (bw != null)
			{
				try
				{
					bw.close();
				}
				catch (IOException e)
				{
					// closing flushes the buffer, so a failure here means lost output
					System.err.println(e);
					status = 1;
				}
			}
		}
		if (status != 0)
		{
			System.exit(status);
		}
	}//main

} //class