degu.util.enchancer
Class PDFTermPositionExctractor

java.lang.Object
  extended by org.pdfbox.util.PDFStreamEngine
      extended by org.pdfbox.util.PDFTextStripper
          extended by degu.util.enchancer.PDFTermPositionExctractor

public class PDFTermPositionExctractor
extends org.pdfbox.util.PDFTextStripper


Field Summary
private  int curPage
           
private  net.sf.snowball.ext.EnglishStemmer englishStemmer
           
private static org.apache.log4j.Logger logger
           
private  org.pdfbox.pdmodel.PDDocument pdDocument
           
private  java.util.Vector<TermPositionInPDF> positions
           
private  java.util.HashMap<java.lang.String,java.lang.Boolean> terms
           
 
Fields inherited from class org.pdfbox.util.PDFTextStripper
charactersByArticle, output
 
Constructor Summary
PDFTermPositionExctractor(org.pdfbox.pdmodel.PDDocument pdDocument, java.util.HashMap<java.lang.String,java.lang.Boolean> terms)
           
 
Method Summary
 java.util.Vector<TermPositionInPDF> searchPositions()
           
 void setPDDcocument(org.pdfbox.pdmodel.PDDocument pdDocument)
           
 void setTerms(java.util.HashMap<java.lang.String,java.lang.Boolean> terms)
           
protected  void showCharacter(org.pdfbox.util.TextPosition text)
          Event hook that allows a subclass to perform its own processing when a character needs to be displayed.
 
Methods inherited from class org.pdfbox.util.PDFTextStripper
endDocument, endPage, endParagraph, flushText, getCharactersByArticle, getCurrentPageNo, getEndBookmark, getEndPage, getLineSeparator, getOutput, getPageSeparator, getStartBookmark, getStartPage, getText, getText, getWordSeparator, processPage, processPages, setEndBookmark, setEndPage, setLineSeparator, setPageSeparator, setShouldSeparateByBeads, setSortByPosition, setStartBookmark, setStartPage, setSuppressDuplicateOverlappingText, setWordSeparator, shouldSeparateByBeads, shouldSortByPosition, shouldSuppressDuplicateOverlappingText, startDocument, startPage, startParagraph, writeCharacters, writeText, writeText
 
Methods inherited from class org.pdfbox.util.PDFStreamEngine
getColorSpaces, getCurrentPage, getFonts, getGraphicsStack, getGraphicsState, getGraphicsStates, getResources, getTextLineMatrix, getTextMatrix, getXObjects, processOperator, processOperator, processStream, processSubStream, setColorSpaces, setFonts, setGraphicsStack, setGraphicsState, setGraphicsStates, setTextLineMatrix, setTextMatrix, showString
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

logger

private static org.apache.log4j.Logger logger

englishStemmer

private net.sf.snowball.ext.EnglishStemmer englishStemmer

pdDocument

private org.pdfbox.pdmodel.PDDocument pdDocument

terms

private java.util.HashMap<java.lang.String,java.lang.Boolean> terms

positions

private java.util.Vector<TermPositionInPDF> positions

curPage

private int curPage

Constructor Detail

PDFTermPositionExctractor

public PDFTermPositionExctractor(org.pdfbox.pdmodel.PDDocument pdDocument,
                                 java.util.HashMap<java.lang.String,java.lang.Boolean> terms)
                          throws java.io.IOException,
                                 org.pdfbox.exceptions.CryptographyException,
                                 org.pdfbox.exceptions.InvalidPasswordException
Throws:
java.io.IOException
org.pdfbox.exceptions.CryptographyException
org.pdfbox.exceptions.InvalidPasswordException

Method Detail

setPDDcocument

public void setPDDcocument(org.pdfbox.pdmodel.PDDocument pdDocument)
                    throws java.io.IOException,
                           org.pdfbox.exceptions.CryptographyException,
                           org.pdfbox.exceptions.InvalidPasswordException
Throws:
java.io.IOException
org.pdfbox.exceptions.CryptographyException
org.pdfbox.exceptions.InvalidPasswordException

setTerms

public void setTerms(java.util.HashMap<java.lang.String,java.lang.Boolean> terms)

searchPositions

public java.util.Vector<TermPositionInPDF> searchPositions()
                                                    throws java.io.IOException
Throws:
java.io.IOException

showCharacter

protected void showCharacter(org.pdfbox.util.TextPosition text)
Event hook that allows a subclass to perform its own processing when a character needs to be displayed.

Overrides:
showCharacter in class org.pdfbox.util.PDFTextStripper
Parameters:
text - The character to be displayed.