public class TokenizerPTB
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
private static char |
DASH |
private static java.lang.String |
ellipsis |
(package private) static java.lang.String[] |
emptyStringList |
(package private) static java.util.ArrayList<BaseToken> |
emptyTokenList |
(package private) static java.lang.String[] |
nameStartingWithApostrophe |
private java.lang.String |
possibleFinalPunctuation |
(package private) static java.lang.String[] |
testsForEmailAddress |
(package private) static java.lang.String[] |
testsForNumbers |
private static java.lang.String[] |
urlStarters |
private java.lang.String |
validOtherEmailAddressCharacters |
Constructor and Description |
---|
TokenizerPTB()
Constructor
|
Modifier and Type | Method and Description |
---|---|
private int |
checkFormat2(java.lang.String s) |
private boolean |
containsLetter(java.lang.String lowerCasedText,
int currentPosition,
int tokenLen) |
private java.lang.Object |
createToken(java.lang.Class<? extends BaseToken> clas,
java.lang.String s,
JCas jcas,
int begin,
int end,
int offsetAdjustment)
if clas is null, determine token class for the caller
if jcas is null,
|
private java.lang.Class<? extends BaseToken> |
determineTokenType(java.lang.String s,
int begin,
int end) |
int |
findFirstCharOfNextToken(java.lang.String s,
int startPosition) |
private int |
getLengthIfIsNumberThatStartsWithPeriod(int currentPosition,
java.lang.String textSegment) |
private int |
getLengthIfNameStartingWithApostrophe(int currentPosition,
java.lang.String textSegment) |
private int |
getLenToNextNonDigit(java.lang.String s,
int startingPosition) |
private boolean |
isContraction(char c) |
private boolean |
isEllipsis(int currentPosition,
java.lang.String textSegment) |
private boolean |
isEndOfLine(char c) |
private boolean |
isNumericChar(char ch)
",.0123456789"
|
private boolean |
isPossibleFinalPunctuation(char c) |
private boolean |
isTelephoneNumberChar(char ch)
"0123456789-"
|
private int |
lenIfIsAbbreviation(int currentPosition,
java.lang.String mixedCaseText,
int afterEndOfInputToConsider)
Assumes no white space between currentPosition and afterEndOfInputToConsider.
If the last character of a sentence is a period, don't include the period with the abbreviation;
count it as punctuation.
|
private int |
lenIfIsEmailAddress(int currentPosition,
java.lang.String lowerCasedText,
int endOfInputToConsider)
Assumes no white space between currentPosition and endOfInputToConsider
|
private int |
lenIfIsNumberContainingComma(int currentPosition,
java.lang.String text,
int nextNonNumericChar)
such as -4,012.67 or 5 or 5.5 or 4,000,153
|
private int |
lenIfIsPostalCode(int currentPosition,
java.lang.String text,
int nextNonPostalCodeChar) |
private int |
lenIfIsTelephoneNumber(int currentPosition,
java.lang.String text,
int nextNonTelephoneNumberChar) |
private int |
lenIfIsUrl(int currentPosition,
java.lang.String lowerCasedText,
int endOfInputToConsider) |
static void |
main(java.lang.String[] args) |
(package private) static void |
runEmailTests() |
(package private) static void |
runNumberTests() |
private void |
setCapitalization(WordToken wta,
java.lang.String tokenText) |
private void |
setNumPosition(WordToken wta,
java.lang.String tokenText) |
private void |
setNumType(NumToken nta,
java.lang.String tokenText) |
java.util.List<?> |
tokenize(java.lang.String text)
Tokenize a string that is assumed to be the entire document (or at least to start at 0)
|
java.util.List<?> |
tokenizeTextSegment(JCas jcas,
java.lang.String textSegment,
int offsetAdjustment,
boolean includeTextNotJustOffsets)
Tokenize text that starts at offset offsetAdjustment within the complete text
|
private boolean |
verify(int begin,
int end,
int offsetAdjustment) |
private java.lang.Class<? extends BaseToken> |
wordTokenOrNumToken(java.lang.String lowerCasedText,
int currentPosition,
int tokenLen) |
static final java.lang.String[] emptyStringList
static final java.util.ArrayList<BaseToken> emptyTokenList
private static char DASH
private static java.lang.String ellipsis
static java.lang.String[] nameStartingWithApostrophe
private java.lang.String possibleFinalPunctuation
private java.lang.String validOtherEmailAddressCharacters
private static java.lang.String[] urlStarters
static java.lang.String[] testsForNumbers
static java.lang.String[] testsForEmailAddress
public java.util.List<?> tokenizeTextSegment(JCas jcas, java.lang.String textSegment, int offsetAdjustment, boolean includeTextNotJustOffsets)
textSegment
- the text to tokenize
offsetAdjustment
- what to add to all offsets within textSegment to make them be offsets from the start of the text for the jcas
includeTextNotJustOffsets
- whether to copy the text covered by this token into the token object itself
public java.util.List<?> tokenize(java.lang.String text)
text
- the String to tokenize
private int lenIfIsNumberContainingComma(int currentPosition, java.lang.String text, int nextNonNumericChar)
currentPosition
-
text
-
nextNonNumericChar
-
private int lenIfIsPostalCode(int currentPosition, java.lang.String text, int nextNonPostalCodeChar)
private int lenIfIsTelephoneNumber(int currentPosition, java.lang.String text, int nextNonTelephoneNumberChar)
private int checkFormat2(java.lang.String s)
private boolean isTelephoneNumberChar(char ch)
ch
-
private boolean isNumericChar(char ch)
ch
-
private int getLenToNextNonDigit(java.lang.String s, int startingPosition)
private java.lang.Class<? extends BaseToken> wordTokenOrNumToken(java.lang.String lowerCasedText, int currentPosition, int tokenLen)
private boolean containsLetter(java.lang.String lowerCasedText, int currentPosition, int tokenLen)
lowerCasedText
-
currentPosition
-
tokenLen
-
private boolean isEllipsis(int currentPosition, java.lang.String textSegment)
private int getLengthIfNameStartingWithApostrophe(int currentPosition, java.lang.String textSegment)
private int getLengthIfIsNumberThatStartsWithPeriod(int currentPosition, java.lang.String textSegment)
private int lenIfIsAbbreviation(int currentPosition, java.lang.String mixedCaseText, int afterEndOfInputToConsider)
currentPosition
-
mixedCaseText
-
afterEndOfInputToConsider
-
private boolean isPossibleFinalPunctuation(char c)
private int lenIfIsEmailAddress(int currentPosition, java.lang.String lowerCasedText, int endOfInputToConsider)
currentPosition
-
lowerCasedText
-
endOfInputToConsider
-
private int lenIfIsUrl(int currentPosition, java.lang.String lowerCasedText, int endOfInputToConsider)
private java.lang.Class<? extends BaseToken> determineTokenType(java.lang.String s, int begin, int end)
private boolean isContraction(char c)
private boolean verify(int begin, int end, int offsetAdjustment)
private java.lang.Object createToken(java.lang.Class<? extends BaseToken> clas, java.lang.String s, JCas jcas, int begin, int end, int offsetAdjustment)
See Also: org.apache.ctakes.core.ae.TokenConverter#convert(org.apache.ctakes.core.nlp.tokenizer.Token, org.apache.uima.jcas.JCas, int)
private void setNumType(NumToken nta, java.lang.String tokenText)
See Also: Tokenizer.isNumber(java.lang.String)
private void setNumPosition(WordToken wta, java.lang.String tokenText)
private void setCapitalization(WordToken wta, java.lang.String tokenText)
public int findFirstCharOfNextToken(java.lang.String s, int startPosition)
private boolean isEndOfLine(char c)
public static void main(java.lang.String[] args)
static void runNumberTests()
static void runEmailTests()