public class Tokenizer
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
private int |
iv_freqCutoff |
private java.util.Map<java.lang.String,java.lang.Integer> |
iv_hyphMap |
private OffsetComparator |
iv_offsetComp |
Constructor and Description |
---|
Tokenizer()
Constructor
|
Tokenizer(java.util.Map<java.lang.String,java.lang.Integer> hyphMap,
int freqCutoff)
Constructor
|
Modifier and Type | Method and Description |
---|---|
private void |
applyCapitalizationRules(Token token,
java.lang.String tokenText)
Applies capitalization rules to the given token.
|
private void |
applyPunctSymbolRules(java.util.List<Token> tokens,
java.lang.String text)
Applies punctuation/symbol rules to the given list of tokens.
|
private void |
applyWordNumRules(Token token,
java.lang.String tokenText) |
private boolean |
findPunctSymbolInsideToken(java.util.List<Token> tokens,
Token token,
java.lang.String tokenText)
Finds punctuation/symbols located inside a token.
|
private java.util.List<Token> |
getEndOfLineTokens(java.lang.String text)
Gets a list of tokens that mark end of a line.
|
private int |
getFirstInsidePunctSymbol(java.lang.String tokenText) |
private java.util.List<Token> |
getRawTokens(java.lang.String text)
Text is split based on whitespace into raw tokens.
|
boolean |
isAlphabetLetter(char c) |
private boolean |
isAlphabetLetterOrDigit(char c) |
private boolean |
isDigit(char c) |
private boolean |
isInteger(java.lang.String tokenText)
Given that the token text is a number, this method will determine if the
number is an integer or not.
|
static boolean |
isNumber(java.lang.String tokenText)
Applies number rules to the given token.
|
private boolean |
isPunctuation(char c) |
private int |
processEndPunctSymbol(java.util.List<Token> newTokenList,
Token token,
java.lang.String tokenText) |
private int |
processStartPunctSymbol(java.util.List<Token> newTokenList,
Token token,
java.lang.String tokenText) |
java.util.List<Token> |
tokenize(java.lang.String text)
Tokenizes a string of text and outputs a list of Token objects.
|
java.util.List<Token> |
tokenizeAndSort(java.lang.String text)
Tokenizes a string of text and outputs a list of Token objects in sorted
order.
|
static void |
validateHyphenMap(java.util.Map<java.lang.String,java.lang.Integer> hyphMap)
Validate the structure of the hyphen map.
|
private OffsetComparator iv_offsetComp
private java.util.Map<java.lang.String,java.lang.Integer> iv_hyphMap
private int iv_freqCutoff
public Tokenizer()
public Tokenizer(java.util.Map<java.lang.String,java.lang.Integer> hyphMap, int freqCutoff)
hyphMap
- Map where key=hyphenated string (lower cased) value=freq Integer
freqCutoff
- frequency cutoff
public static void validateHyphenMap(java.util.Map<java.lang.String,java.lang.Integer> hyphMap) throws java.lang.Exception
java.lang.Exception
public java.util.List<Token> tokenizeAndSort(java.lang.String text) throws java.lang.Exception
text
- The text to tokenize.
java.lang.Exception
- Thrown if an error occurs while tokenizing.
public java.util.List<Token> tokenize(java.lang.String text) throws java.lang.Exception
text
- The text to tokenize.
java.lang.Exception
private void applyPunctSymbolRules(java.util.List<Token> tokens, java.lang.String text)
tokens
- List of tokens to apply rules to.
text
- The original text.
private int processStartPunctSymbol(java.util.List<Token> newTokenList, Token token, java.lang.String tokenText)
private int processEndPunctSymbol(java.util.List<Token> newTokenList, Token token, java.lang.String tokenText)
private int getFirstInsidePunctSymbol(java.lang.String tokenText)
private boolean findPunctSymbolInsideToken(java.util.List<Token> tokens, Token token, java.lang.String tokenText)
tokens
-
token
-
tokenText
-
private boolean isPunctuation(char c)
private boolean isAlphabetLetterOrDigit(char c)
public boolean isAlphabetLetter(char c)
private boolean isDigit(char c)
public static boolean isNumber(java.lang.String tokenText)
private boolean isInteger(java.lang.String tokenText)
tokenText
-
private void applyCapitalizationRules(Token token, java.lang.String tokenText)
token
-
tokenText
-
private void applyWordNumRules(Token token, java.lang.String tokenText)
private java.util.List<Token> getEndOfLineTokens(java.lang.String text)
text
-
private java.util.List<Token> getRawTokens(java.lang.String text)
text
-