
/*
 *  PHEX - The pure-java Gnutella-servent.
 *  Copyright (C) 2000 William W. Wong
 *  williamw@jps.net
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */


package phex.share;


import java.io.*;
import java.util.*;

import phex.interfaces.*;


/**
 * Indexer for plain-text files.
 *
 * Every line of the file is split into tokens on {@link #mDelimiter}, each
 * token is upper-cased, stripped of leading/trailing punctuation
 * ({@link #mFilterChars}) and dropped if it is one of the stop words in
 * {@link #mFilterWords}.  The surviving words become the keys of the
 * Hashtable returned by {@link #index(File)}.
 */
public class IndexerTxt implements IFileIndexer
{
	/** Delimiter characters handed to StringTokenizer when splitting a line. */
	protected String	mDelimiter = " ";
	/** Characters removed from the beginning and the end of every token. */
	protected String	mFilterChars = ".!?:;,(){}[]/\\";
	/** Upper-case stop words; only the keys matter, the values are empty strings. */
	protected Hashtable	mFilterWords = new Hashtable();



	/**
	 * Creates an indexer and populates the stop-word table.
	 *
	 * Note: setupFileterWords() is overridable, so a subclass override runs
	 * before the subclass' own field initialisers have executed.
	 */
	public IndexerTxt()
	{
		setupFileterWords();
	}


	/**
	 * Fills mFilterWords with the default (English, upper-case) stop words.
	 * The method name, including its spelling, is kept as is because
	 * subclasses may override it.
	 */
	protected void setupFileterWords()
	{
		String[]	stopWords =
		{
			"OF", "THE", "A", "AN", "OR", "AND", "AS", "IN", "ON",
			"INTO", "ONTO", "AT", "UNDER", "THIS", "THAT", "THESE",
			"THOSE", "IT", "I", "YOU", "HE", "SHE", "HIS", "HER",
			"HERS", "ME"
		};

		for (int i = 0; i < stopWords.length; i++)
		{
			mFilterWords.put(stopWords[i], "");
		}
	}


	/**
	 * Reads the given file line by line and collects its distinct words.
	 *
	 * @param file the text file to read (decoded with the platform default
	 *             charset, as FileReader does).
	 * @return a Hashtable whose keys are the upper-cased, filtered words found
	 *         in the file; every value is the empty string.
	 * @throws Exception if the file cannot be opened or read.
	 */
	public Hashtable index(File file)
			throws Exception
	{
		Hashtable			words = new Hashtable();
		BufferedReader		br = new BufferedReader(new FileReader(file));

		try
		{
			String			line;

			while ((line = br.readLine()) != null)
			{
				StringTokenizer	tokens = new StringTokenizer(line, mDelimiter);
				while (tokens.hasMoreTokens())
				{
					// Fixed locale: the stop words are plain ASCII upper case and
					// must match regardless of the platform's default locale
					// (e.g. Turkish maps 'i' to a dotted capital I).
					String		word = tokens.nextToken().toUpperCase(Locale.ENGLISH);

					word = filter(word);

					if (word != null)
						words.put(word, "");
				}
			}
		}
		finally
		{
			// Always release the file handle, also when reading fails.
			br.close();
		}

		return words;
	}


	/**
	 * Strips leading and trailing filter characters from a word and rejects
	 * stop words.
	 *
	 * @param word the (already upper-cased) token to clean up; may be null.
	 * @return the trimmed word, or null if the word is null, nothing is left
	 *         after trimming, or the trimmed word is in mFilterWords.
	 */
	protected String filter(String word)
	{
		if (word == null)
			return null;

		int		start = 0;
		int		end = word.length();

		// Drop trailing filter characters.
		while (end > start && mFilterChars.indexOf(word.charAt(end - 1)) >= 0)
			end--;

		// Drop leading filter characters.
		while (start < end && mFilterChars.indexOf(word.charAt(start)) >= 0)
			start++;

		if (start >= end)
			return null;

		String	trimmed = word.substring(start, end);

		if (mFilterWords.containsKey(trimmed))
			return null;

		return trimmed;
	}



	/**
	 * Command line helper: indexes the file named by the first argument and
	 * prints the resulting table to standard output.
	 *
	 * @param args args[0] is the path of the text file to index.
	 */
	public static void main(String[] args)
	{
		if (args.length < 1)
		{
			System.err.println("Usage: java phex.share.IndexerTxt <text-file>");
			return;
		}

		try
		{
			IndexerTxt	indexer = new IndexerTxt();
			File		file = new File(args[0]);

			System.out.println(indexer.index(file));
		}
		catch(Exception e)
		{
			e.printStackTrace();
		}
	}


}


