public class ContractionsPTB
extends java.lang.Object
Modifier and Type | Field and Description |
---|---|
(package private) static java.lang.String[] |
contractionsStartingWithApostrophe |
private static java.lang.String[] |
fullWordsNotToBreakAtApostrophe |
(package private) static java.lang.String |
lettersAfterApostropheForMiddleOfContraction |
(package private) static int[] |
MultiTokenWordLenToken1 |
(package private) static int[] |
MultiTokenWordLenToken2 |
(package private) static int[] |
MultiTokenWordLenToken3 |
(package private) static java.lang.String[] |
MultiTokenWords |
(package private) static java.util.HashMap<java.lang.String,java.lang.Integer> |
MultiTokenWordsLookup |
(package private) static java.lang.String[] |
possibleContractionEndings |
Constructor and Description |
---|
ContractionsPTB() |
Modifier and Type | Method and Description |
---|---|
(package private) static boolean |
allDigits(java.lang.String s) |
(package private) static boolean |
breakAtApostrophe(java.lang.String s,
int positionOfApostropheToTest)
Assumes apostrophe is not first character....
|
(package private) static int |
getLenContractionToken(int currentPosition,
java.lang.String lowerCasedText) |
static ContractionResult |
getLengthIfNextApostIsMiddleOfContraction(int position,
int nextNonLetterDigit,
java.lang.String lowerCasedText)
Determine if the text starting at 'position' within 'lowerCasedText' is the start of a
contraction such as "should've" or "hasn't" or "it's" by looking at whether
there is a letter before the apostrophe, and the appropriate letters after the
apostrophe (or, in the case of "n't", verifying that the letter before is an 'n').
Note that if the text starting at 'position' is something like "n't" which
isn't a complete word, returns null.
|
(package private) static boolean |
isContractionThatStartsWithApostrophe(int currentPosition,
java.lang.String lowerCasedText) |
(package private) static int |
lenOfFirstTokenInContraction(java.lang.String s) |
(package private) static int |
lenOfSecondTokenInContraction(java.lang.String s) |
(package private) static int |
lenOfThirdTokenInContraction(java.lang.String s) |
static void |
main(java.lang.String[] args) |
private static void |
test_getLengthIfNextApostIsMiddleOfContraction() |
(package private) static int |
tokenLengthCheckingForSingleQuoteWordsToKeepTogether(java.lang.String lowerCasedText)
For a word like 80's or P'yongyang or James' or Sean's or 80's-like or 80's-esque
(or can't or haven't, which are to be split),
determine whether the single quote (apostrophe)
needs to be kept with the surrounding letters/numbers,
and what to do about the hyphenated part afterwards if there is a hyphen after....
|
static java.lang.String[] MultiTokenWords
static int[] MultiTokenWordLenToken1
static int[] MultiTokenWordLenToken2
static int[] MultiTokenWordLenToken3
static java.util.HashMap<java.lang.String,java.lang.Integer> MultiTokenWordsLookup
static java.lang.String[] possibleContractionEndings
static java.lang.String lettersAfterApostropheForMiddleOfContraction
static java.lang.String[] contractionsStartingWithApostrophe
private static java.lang.String[] fullWordsNotToBreakAtApostrophe
private static void test_getLengthIfNextApostIsMiddleOfContraction()
public static ContractionResult getLengthIfNextApostIsMiddleOfContraction(int position, int nextNonLetterDigit, java.lang.String lowerCasedText)
position
- first char of next token
lowerCasedText
- text into which parameter position is an index
for handling contractions like "cannot" that don't have an apostrophe
static int getLenContractionToken(int currentPosition, java.lang.String lowerCasedText)
static int lenOfFirstTokenInContraction(java.lang.String s)
s
- isMiddleOfContraction
static int lenOfSecondTokenInContraction(java.lang.String s)
static int lenOfThirdTokenInContraction(java.lang.String s)
static boolean isContractionThatStartsWithApostrophe(int currentPosition, java.lang.String lowerCasedText)
static boolean breakAtApostrophe(java.lang.String s, int positionOfApostropheToTest)
s
is lower case.
static boolean allDigits(java.lang.String s)
static int tokenLengthCheckingForSingleQuoteWordsToKeepTogether(java.lang.String lowerCasedText)
public static void main(java.lang.String[] args)