/*
    OpenDocumentTextInputStream extracts raw text from an OpenDocument
	text file.
    Copyright (C) 2005  J. David Eisenberg

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
	
	Author: J. David Eisenberg
	Contact: catcode@catcode.com

*/
package com.catcode.odf;

import java.io.InputStream;
import java.io.IOException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.FilterInputStream;

/**
 * OpenDocumentTextInputStream reads the content of an
 * OASIS Open Document Format text (word processing) file.
 * <p>
 * Limitations/restrictions:
 * </p>
 * <ul>
 * <li>The namespaces must all be in the root element.</li>
 * <li>No data is returned for embedded objects.</li>
 * <li>Will not properly handle XML comments that contain elements.</li>
 * </ul>
 *
 * <p>
 * You can set two lists of element names (using the
 * <code>ElementPostProcess</code> class). The capture list is the
 * list of elements whose text you want; the omit list is the
 * list of elements within which text is never output.  The default
 * capture list is <code>&lt;text:h&gt;</code>,
 * <code>&lt;text:p&gt;</code>, <code>&lt;text:tab&gt;</code> and
 * <code>&lt;text:s&gt;</code>. The default omit list
 * is <code>&lt;text:tracked-changes&gt;</code>.
 * </p>
 *
 * <p>
 * Each stream keeps its own capture and omit lists, so several streams
 * with different lists can be used side by side.
 * </p>
 *
 *	@author		J. David Eisenberg
 *	@version	0.1, 2005-10-16
*/

public class OpenDocumentTextInputStream extends FilterInputStream
{
	/* the namespace prefix bound to the ODF text namespace */
	private String textNamespace;

	/* matches the optional prefix and the local name at the start of a tag */
	private static final Pattern elementNamePattern =
		Pattern.compile("^/?(?:([\\p{L}\\p{N}_.-]+):)?([\\p{L}\\p{N}_.-]+)");

	/*
	 * Matches the declaration that binds a prefix (possibly the empty
	 * default prefix) to the ODF text namespace. Compiled once here
	 * rather than every time a root element is analyzed.
	 */
	private static final Pattern textNamespacePattern =
		Pattern.compile("xmlns:?([\\p{L}\\p{N}_.-]*)\\s*=\\s*" +
			"\"urn:oasis:names:tc:opendocument:xmlns:text:1.0\"");

	/*
	 * If the source file has a Unicode character whose value is
	 * >= 0x80, then we have to split it into several bytes and
	 * parcel them out one at a time when read() is called.
	 * The utf8Output buffer is the holding area for those bytes.
	 * It is also used to hand an end-of-file marker (-1) or a
	 * post-process character back to read().
	 */
	private int[] utf8Output;
	private int utf8OutputPosition;
	private int utf8OutputLength;

	/*
	 * We are interested only in text within "capture" elements.
	 * We also keep track of how deeply nested we are in
	 * capture elements.
	 *
	 * This is an instance field: a static list would be overwritten
	 * for every existing stream whenever a new stream is constructed.
	 */
	private ArrayList captureList;
	private int captureDepth;

	/*
	 * If we are inside an "omit" element, then we never
	 * output its text, even if we encounter a capture element
	 * inside.
	 */
	private ArrayList omitList;
	private int omitDepth;

	/* true until the first element tag has been analyzed */
	private boolean rootElement;

	/*
	 * The standard five (and only recognized!) entities
	 * and their corresponding characters
	 */
	private static final String[] stdFiveEntities = {
		"apos", "quot", "lt", "gt", "amp"
	};
	static final byte[] stdFiveValues = {
		'\'', '"', '<', '>', '&'
	};

	/**
	 * Constructs an OASIS Open Document Text input stream
	 * that uses the default capture and omit lists.
	 *
	 * @param	in				the actual input stream
	 */
	public OpenDocumentTextInputStream( InputStream in )
	{
		this( in, null, null );
	}

	/**
	 * Constructs an OASIS Open Document Text input stream.
	 * This constructor lets you provide a list of "capture" elements
	 * whose content you wish to examine, and "omit" elements whose
	 * content will always be omitted. Both lists must contain
	 * <code>ElementPostProcess</code> objects. The lists are searched
	 * linearly by element name, and are used as given (they are not
	 * copied).
	 * <p>
	 * If you want an empty list for either one of these, pass in
	 * an empty <code>ArrayList</code>. Passing in <code>null</code>
	 * will set you up with the default capture or omit list.
	 * </p>
	 *
	 * @param	in			the actual input stream
	 * @param	capture		an <code>ArrayList</code> of
	 *						elements whose content will be
	 *						read by this stream
	 * @param	omit		an <code>ArrayList</code> of elements
	 *						whose content will be ignored by this stream.
	 */
	public OpenDocumentTextInputStream( InputStream in,
		ArrayList capture, ArrayList omit )
	{
		super( in );

		/* initialize variables */
		utf8Output = new int[4];
		utf8OutputPosition = 0;
		utf8OutputLength = 0;
		rootElement = true;

		if (capture == null)
		{
			captureList = new ArrayList(4);
			captureList.add( new ElementPostProcess( "h", '\n') );
			captureList.add( new ElementPostProcess( "p", '\n' ) );
			captureList.add( new ElementPostProcess( "tab", '\t' ) );
			captureList.add( new ElementPostProcess( "s", ' ') );
		}
		else
		{
			captureList = capture;
		}
		if (omit == null)
		{
			omitList = new ArrayList(1);
			omitList.add( new ElementPostProcess( "tracked-changes", '\0' ) );
		}
		else
		{
			omitList = omit;
		}
		captureDepth = 0;
		omitDepth = 0;
	}

	/**
	 * Reads the next byte of data from this input stream.
	 * If no byte is available because the end of the stream has been reached,
	 * the value -1 is returned. Only bytes that lie inside a capture
	 * element and outside every omit element are returned; this applies
	 * to entity and character references as well as to literal text.
	 * Zero bytes are never returned, because zero is used internally
	 * as the "nothing to emit yet" marker.
	 *
	 *	@return	the next byte of data, or <code>-1</code>
	 *			if the end of the stream is reached.
	 *	@throws	IOException	if an I/O error occurs.
	 *	@throws	IllegalArgumentException	if a tag or entity
	 *			cannot be interpreted.
	 */
	public int read( ) throws IOException
	{
		int theByte = 0;
		while (theByte == 0)
		{
			if (utf8OutputPosition < utf8OutputLength)
			{
				/* a buffered sequence is still in progress; emit it */
				theByte = utf8Output[utf8OutputPosition++];
			}
			else
			{
				theByte = in.read( );
				if (theByte == '<')
				{
					collectTag();
					theByte = 0;
				}
				else if (theByte == '&')
				{
					collectEntity();
					/*
					 * An entity is text like any other: drop it when
					 * text is currently suppressed. The end-of-file
					 * marker must always get through.
					 */
					if (isSuppressed() && utf8Output[0] != -1)
					{
						utf8OutputLength = 0;
						utf8OutputPosition = 0;
					}
					theByte = 0;
				}
				else if (theByte != -1 && isSuppressed())
				{
					theByte = 0;
				}
			}
		}
		return theByte;
	}

	/**
	 * Tells whether text at the current position must be discarded.
	 *
	 * @return <code>true</code> if we are inside an omit element or
	 *         not inside any capture element.
	 */
	private boolean isSuppressed( )
	{
		return (omitDepth > 0 || captureDepth == 0);
	}

	/**
	 * Reads some number of bytes from the input stream and stores them into
	 * the buffer array <code>b</code>. The number of bytes actually read is
	 * returned as an integer.
	 *
	 * @param b the buffer to fill
	 * @return the number of bytes read, or <code>-1</code> at end of stream
	 * @throws IOException if the first byte cannot be read
	 */
	public int read(byte b[]) throws IOException
	{
		return read(b, 0, b.length);
	}

	/**
	 * Reads up to <code>len</code> bytes of data from the input stream into
	 * an array of bytes, one filtered byte at a time. The number of bytes
	 * actually read is returned as an integer. The logic follows the
	 * default implementation in <code>InputStream</code>.
	 *
	 * @param b   the buffer to fill
	 * @param off the index in <code>b</code> of the first byte to store
	 * @param len the maximum number of bytes to read
	 * @return the number of bytes read, or <code>-1</code> if the end of
	 *         the stream was reached before any byte was read
	 * @throws IOException if the first byte cannot be read
	 * @throws NullPointerException if <code>b</code> is <code>null</code>
	 * @throws IndexOutOfBoundsException if <code>off</code> and
	 *         <code>len</code> do not describe a range inside <code>b</code>
	 */
	public int read(byte b[], int off, int len) throws IOException
	{
		if (b == null)
		{
			throw new NullPointerException();
		}
		else if ((off < 0) || (off > b.length) || (len < 0) ||
		   ((off + len) > b.length) || ((off + len) < 0))
		{
			throw new IndexOutOfBoundsException();
		}
		else if (len == 0)
		{
			return 0;
		}

		int c = read();
		if (c == -1)
		{
			return -1;
		}
		b[off] = (byte) c;

		int count = 1;
		try
		{
			while (count < len)
			{
				c = read();
				if (c == -1)
				{
					break;
				}
				b[off + count] = (byte) c;
				count++;
			}
		}
		catch (IOException ignored)
		{
			/*
			 * Deliberate, as in InputStream.read(byte[], int, int):
			 * an I/O error after the first byte ends this read early
			 * and the bytes gathered so far are returned.
			 */
		}
		return count;
	}

	/**
	 * Skips the specified number of filtered bytes, that is, bytes
	 * that <code>read()</code> would have returned.
	 *
	 * @param n the number of bytes to skip
	 * @return the actual number of bytes skipped
	 * @exception IOException if an I/O error has occurred
	 * @exception IllegalArgumentException if n < 0
	 */
	public long skip(long n) throws IOException
	{
		if (n < 0)
		{
			throw new IllegalArgumentException("negative skip length");
		}

		byte[] scratch = new byte[512];
		int max = (int) Math.min(n, Integer.MAX_VALUE);
		int total = 0;
		while (total < max)
		{
			int len = Math.min( max - total, scratch.length );
			len = read(scratch, 0, len);
			if (len == -1)
			{
				break;
			}
			total += len;
		}
		return total;
	}

	/**
	 * Collect all characters up to and including the ending semicolon
	 * of the entity. The leading ampersand has already been consumed.
	 *
	 * Accepts entities in form &#nnn; &#xnnn; &alpha;, but checks to see
	 * that alpha entities are only the "big five".
	 * <p>
	 * This method will fill the <code>utf8Output[]</code> array,
	 * set <code>utf8OutputLength</code> appropriately, and
	 * set <code>utf8OutputPosition</code> to zero.
	 * </p>
	 * <p>
	 * If we hit the end of file, put <code>-1</code> in the utf8 buffer;
	 * the main loop in <code>read()</code> will emit it the next time through.
	 * </p>
	 *
	 * @throws	IOException	if I/O error occurs while reading bytes.
	 * @throws	IllegalArgumentException	if the entity is empty, is
	 *			not one of the five standard entities, or is a malformed
	 *			or out-of-range character reference.
	 */
	protected void collectEntity( ) throws IOException
	{
		StringBuffer strBuf = new StringBuffer(10);
		String entityString;
		int entityValue;

		int b = in.read();
		while (b != ';' && b != -1)
		{
			strBuf.append( (char) b );
			b = in.read();
		}

		if (b == -1)
		{
			utf8Output[0] = -1;
			utf8OutputLength = 1;
		}
		else if (strBuf.length() == 0)
		{
			/* "&;" -- report it like any other unrecognized entity */
			throw new IllegalArgumentException( "Unknown entity &;" );
		}
		else if (strBuf.charAt(0) == '#')
		{
			/* numeric entity; leading "x" means hex */
			entityString = strBuf.substring(1).toLowerCase();
			if (entityString.startsWith("x"))
			{
				entityValue =
					Integer.parseInt( entityString.substring(1), 16 );
			}
			else
			{
				entityValue = Integer.parseInt( entityString, 10 );
			}
			createUTF8Output( entityValue );
		}
		else
		{
			/* alphabetic entity */
			entityString = strBuf.toString();
			int i = 0;
			while (i < stdFiveEntities.length &&
				!entityString.equals( stdFiveEntities[i] ))
			{
				i++;
			}
			if (i == stdFiveEntities.length)
			{
				throw new IllegalArgumentException( "Unknown entity &"
					+ entityString + ";" );
			}
			utf8Output[0] = stdFiveValues[i];
			utf8OutputLength = 1;
		}
		utf8OutputPosition = 0;
	}

	/**
	 * Split a Unicode value into UTF-8 bytes.
	 * Puts bytes into <code>utf8Output[]</code> and sets the
	 * <code>utf8OutputLength</code> appropriately.
	 *
	 * @param value the Unicode code point to encode
	 * @throws IllegalArgumentException if <code>value</code> is negative
	 *         or greater than 0x10FFFF.
	 */
	protected void createUTF8Output( int value )
	{
		/*
		 *   Char. number range  |        UTF-8 octet sequence
		 *      (hexadecimal)    |              (binary)
		 *   --------------------+----------------------------------
		 *   0000 0000-0000 007F | 0xxxxxxx
		 *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
		 *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
		 *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		 */
		if (value < 0 || value > 0x10ffff)
		{
			throw new IllegalArgumentException(
				value + " outside Unicode range."
			);
		}
		if (value <= 0x7f)
		{
			utf8Output[0] = value;
			utf8OutputLength = 1;
		}
		else if (value <= 0x7ff)
		{
			utf8Output[0] = 0xc0 | ((value >> 6) & 0x1f);
			utf8Output[1] = 0x80 | (value & 0x3f);
			utf8OutputLength = 2;
		}
		else if (value <= 0xffff)
		{
			utf8Output[0] = 0xe0 | ((value >> 12) & 0xf);
			utf8Output[1] = 0x80 | ((value >> 6) & 0x3f);
			utf8Output[2] = 0x80 | (value  & 0x3f);
			utf8OutputLength = 3;
		}
		else
		{
			utf8Output[0] = 0xf0 | ((value >> 18) & 0x7);
			utf8Output[1] = 0x80 | ((value >> 12) & 0x3f);
			utf8Output[2] = 0x80 | ((value >> 6) & 0x3f);
			utf8Output[3] = 0x80 | (value & 0x3f);
			utf8OutputLength = 4;
		}
	}

	/**
	 * Collects information between angle brackets into a string buffer
	 * and passes it to <code>analyzeTag()</code>. The opening
	 * &lt; has already been consumed.
	 *
	 * <p>
	 * Reads from file until encountering a &gt; symbol.  If a byte
	 * has a value greater than 127, then call <code>collectUTF8()</code>
	 * to combine it and the following bytes into a Unicode character.
	 * Tabs, line breaks and the Unicode line/paragraph separators are
	 * replaced by blanks.
	 * </p>
	 * <p>
	 * If we hit the end of file, put <code>-1</code> in the utf8 buffer;
	 * the main loop in <code>read()</code> will emit it the next time through.
	 * </p>
	 *
	 * @throws	IOException	if I/O error occurs while reading bytes.
	 * @throws	IllegalArgumentException	if the tag cannot be analyzed.
	 */
	protected void collectTag() throws IOException
	{
		StringBuffer tagBuffer = new StringBuffer(50);

		int b = in.read();
		while (b != '>' && b != -1)
		{
			if (b > 127)
			{
				b = collectUTF8( b );
				if (b == -1)
				{
					/* input ended in the middle of a UTF-8 sequence */
					break;
				}
			}
			/* replace whitespace characters with blanks */
			if (b == 0x09 || b == 0x0a || b == 0x0d || b == 0x0085
				|| b == 0x2028 || b == 0x2029)
			{
				b = 0x20;
			}
			appendCodePoint( tagBuffer, b );
			b = in.read();
		}

		if (b != -1)
		{
			analyzeTag( tagBuffer.toString() );
		}
		else
		{
			utf8Output[0] = -1;
			utf8OutputLength = 1;
			utf8OutputPosition = 0;
		}
	}

	/**
	 * Appends one Unicode code point to a buffer, using a surrogate
	 * pair for code points above U+FFFF (a plain cast to
	 * <code>char</code> would silently truncate them). Values beyond
	 * U+10FFFF are replaced by U+FFFD.
	 *
	 * @param buf       the buffer to append to
	 * @param codePoint the code point to append
	 */
	private static void appendCodePoint( StringBuffer buf, int codePoint )
	{
		if (codePoint > 0x10ffff)
		{
			codePoint = 0xfffd;
		}
		if (codePoint > 0xffff)
		{
			int offset = codePoint - 0x10000;
			buf.append( (char) (0xd800 | (offset >> 10)) );
			buf.append( (char) (0xdc00 | (offset & 0x3ff)) );
		}
		else
		{
			buf.append( (char) codePoint );
		}
	}

	/**
	 * Create a Unicode character from the bytes of a UTF-8 sequence.
	 * The continuation bytes are read from the underlying stream.
	 * <p>
	 * A byte below 0x80 is returned unchanged. A stray continuation
	 * byte (0x80 to 0xBF) cannot start a sequence; it yields U+FFFD
	 * and no further input is consumed.
	 * </p>
	 *
	 * @param startByte the starting byte of a UTF-8 sequence.
	 * @return the decoded Unicode value, or <code>-1</code> if the end
	 *         of the stream was reached inside the sequence.
	 * @throws IOException if an I/O error occurs while reading bytes.
	 */
	protected int collectUTF8( int startByte ) throws IOException
	{
		int highBits = (startByte >> 4) & 0x0f;
		int nContinuation;
		int result;

		if (highBits == 12 || highBits == 13)
		{
			nContinuation = 1;
			result = startByte & 0x1f;
		}
		else if (highBits == 14)
		{
			nContinuation = 2;
			result = startByte & 0x0f;
		}
		else if (highBits == 15)
		{
			nContinuation = 3;
			result = startByte & 0x07;
		}
		else if (highBits < 8)
		{
			return startByte;
		}
		else
		{
			return 0xfffd;
		}

		/* each continuation byte contributes its low six bits */
		for (int i = 0; i < nContinuation; i++)
		{
			int oneByte = in.read();
			if (oneByte == -1)
			{
				return -1;
			}
			result = (result << 6) | (oneByte & 0x3f);
		}
		return result;
	}

	/**
	 * Set flags to accept or reject characters in this tag.
	 * <p>
	 * Tags starting with <code>!</code> or <code>?</code> are ignored.
	 * The first element tag seen must declare the ODF text namespace;
	 * its prefix is remembered. For later tags in that namespace, the
	 * omit and capture depths are adjusted. When a capture element
	 * closes outside any omit element and has a non-zero post-process
	 * character, that character is queued for output.
	 * </p>
	 *
	 * @param tag the tag to be analyzed, without its angle brackets
	 * @throws IllegalArgumentException if no element name can be found
	 *         in the tag, or if the first element does not declare
	 *         the text namespace.
	 */
	protected void analyzeTag( String tag )
	{
		if (tag.startsWith("!") || tag.startsWith("?"))
		{
			return;
		}

		Matcher m = elementNamePattern.matcher( tag );
		if (!m.find())
		{
			throw new IllegalArgumentException( "Unknown tag <" +
				tag + ">");
		}

		/* an element without a prefix has the empty prefix, not null */
		String prefix = m.group(1);
		if (prefix == null)
		{
			prefix = "";
		}
		String name = m.group(2);

		/*
		 * If this is the root element, it will have the text
		 * namespace in it
		 */
		if (rootElement)
		{
			Matcher ns = textNamespacePattern.matcher( tag );
			if (!ns.find())
			{
				throw new IllegalArgumentException(
					"Cannot find namespace for text"
				);
			}
			textNamespace = ns.group(1);
			rootElement = false;
		}

		if (!prefix.equals(textNamespace))
		{
			return;
		}

		/* an empty element such as <text:tab/> both opens and closes */
		boolean isOpeningTag = !tag.startsWith("/");
		boolean isClosingTag = tag.startsWith("/") || tag.endsWith("/");

		if (findTag( omitList, name ) >= 0)
		{
			if (isOpeningTag)
			{
				omitDepth++;
			}
			if (isClosingTag)
			{
				omitDepth--;
			}
		}

		int position = findTag( captureList, name );
		if (position >= 0)
		{
			ElementPostProcess elementInfo =
				(ElementPostProcess) captureList.get(position);
			if (isOpeningTag)
			{
				captureDepth++;
			}
			if (isClosingTag)
			{
				if ( elementInfo.getPostProcess() != '\0' &&
					omitDepth == 0)
				{
					utf8Output[0] = elementInfo.getPostProcess();
					utf8OutputLength = 1;
					utf8OutputPosition = 0;
				}
				captureDepth--;
			}
		}
	}

	/**
	 * Locates a tag name within a list of <code>ElementPostProcess</code>
	 * by linear search.
	 *
	 * @param list an ArrayList of ElementPostProcess objects.
	 * @param name the name to search for.
	 * @return the position in the list, or -1 if not found.
	 */
	private int findTag( ArrayList list, String name )
	{
		for (int i = 0; i < list.size(); i++)
		{
			if (((ElementPostProcess)list.get(i)).getName().equals(name))
			{
				return i;
			}
		}
		return -1;
	}
}

