/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import java.util.NoSuchElementException;

import org.apache.commons.text.matcher.StringMatcher;
import org.apache.commons.text.matcher.StringMatcherFactory;

/**
 * Tokenizes a string based on delimiters (separators), supporting quoting and ignored characters.
 * <p>
 * This class can split a String into many smaller strings. It aims to do a similar job to
 * {@link java.util.StringTokenizer StringTokenizer}, however it offers much more control and flexibility including
 * implementing the <code>ListIterator</code> interface. By default, it is set up like <code>StringTokenizer</code>.
 * <p>
 * The input String is split into a number of <i>tokens</i>. Each token is separated from the next by a
 * <i>delimiter</i>. One or more delimiter characters must be specified.
 * <p>
 * Each token may be surrounded by quotes. The <i>quote</i> matcher specifies the quote character(s). A quote may be
 * escaped within a quoted section by duplicating itself.
 * <p>
 * Between each token and the delimiter are potentially characters that need trimming. The <i>trimmer</i> matcher
 * specifies these characters. One usage might be to trim whitespace characters.
 * <p>
 * At any point outside the quotes there might potentially be invalid characters. The <i>ignored</i> matcher specifies
 * these characters to be removed. One usage might be to remove new line characters.
 * <p>
 * Empty tokens may be removed or returned as null.
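 * <p>
 * For example, a minimal usage sketch using the CSV factory defined below (output assumes its default settings):
 * <pre>{@code
 * StringTokenizer tokenizer = StringTokenizer.getCSVInstance("a, \"b, c\", d");
 * while (tokenizer.hasNext()) {
 *     System.out.println(tokenizer.next()); // prints "a", then "b, c", then "d"
 * }
 * }</pre>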
 *
 * <pre>
 * "a,b,c" - Three tokens "a","b","c" (comma delimiter)
 * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace)
 * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched)
 * </pre>
 *
 * <table>
 * <caption>StringTokenizer properties and options</caption>
 * <tr>
 * <th>Property</th>
 * <th>Type</th>
 * <th>Default</th>
 * </tr>
 * <tr>
 * <td>delim</td>
 * <td>CharSetMatcher</td>
 * <td>{ \t\n\r\f}</td>
 * </tr>
 * <tr>
 * <td>quote</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>ignore</td>
 * <td>NoneMatcher</td>
 * <td>{}</td>
 * </tr>
 * <tr>
 * <td>emptyTokenAsNull</td>
 * <td>boolean</td>
 * <td>false</td>
 * </tr>
 * <tr>
 * <td>ignoreEmptyTokens</td>
 * <td>boolean</td>
 * <td>true</td>
 * </tr>
 * </table>
 *
 * @since 1.3
 */
public class StringTokenizer implements ListIterator<String>, Cloneable {

    /** Comma separated values tokenizer internal variable. */
    private static final StringTokenizer CSV_TOKENIZER_PROTOTYPE;
    /** Tab separated values tokenizer internal variable. */
    private static final StringTokenizer TSV_TOKENIZER_PROTOTYPE;
    static {
        CSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        CSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.commaMatcher());
        CSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        CSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        CSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        CSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        CSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);

        TSV_TOKENIZER_PROTOTYPE = new StringTokenizer();
        TSV_TOKENIZER_PROTOTYPE.setDelimiterMatcher(StringMatcherFactory.INSTANCE.tabMatcher());
        TSV_TOKENIZER_PROTOTYPE.setQuoteMatcher(StringMatcherFactory.INSTANCE.doubleQuoteMatcher());
        TSV_TOKENIZER_PROTOTYPE.setIgnoredMatcher(StringMatcherFactory.INSTANCE.noneMatcher());
        TSV_TOKENIZER_PROTOTYPE.setTrimmerMatcher(StringMatcherFactory.INSTANCE.trimMatcher());
        TSV_TOKENIZER_PROTOTYPE.setEmptyTokenAsNull(false);
        TSV_TOKENIZER_PROTOTYPE.setIgnoreEmptyTokens(false);
    }

    /** The text to work on. */
    private char[] chars;
    /** The parsed tokens. */
    private String[] tokens;
    /** The current iteration position. */
    private int tokenPos;

    /** The delimiter matcher. */
    private StringMatcher delimMatcher = StringMatcherFactory.INSTANCE.splitMatcher();
    /** The quote matcher. */
    private StringMatcher quoteMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
    /** The ignored matcher. */
    private StringMatcher ignoredMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
    /** The trimmer matcher. */
    private StringMatcher trimmerMatcher = StringMatcherFactory.INSTANCE.noneMatcher();

    /** Whether to return empty tokens as null. */
    private boolean emptyAsNull = false;
    /** Whether to ignore empty tokens. */
    private boolean ignoreEmptyTokens = true;

    // -----------------------------------------------------------------------

    /**
     * Returns a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>CSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StringTokenizer getCSVClone() {
        return (StringTokenizer) CSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings. The default for CSV processing will be
     * to trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance() {
        return getCSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
     * The default for CSV processing will be to trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final String input) {
        final StringTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Comma Separated Value strings, initializing it with the given input.
     * The default for CSV processing will be to trim whitespace from both ends (which can be overridden with the
     * setTrimmer method).
     *
     * @param input
     *            the text to parse
     * @return a new tokenizer instance which parses Comma Separated Value strings
     */
    public static StringTokenizer getCSVInstance(final char[] input) {
        final StringTokenizer tok = getCSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Returns a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     *
     * @return a clone of <code>TSV_TOKENIZER_PROTOTYPE</code>.
     */
    private static StringTokenizer getTSVClone() {
        return (StringTokenizer) TSV_TOKENIZER_PROTOTYPE.clone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
     * to trim whitespace from both ends (which can be overridden with the setTrimmer method).
     * <p>
     * You must call a "reset" method to set the string which you want to parse.
     *
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance() {
        return getTSVClone();
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
     * to trim whitespace from both ends (which can be overridden with the setTrimmer method).
     *
     * @param input
     *            the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final String input) {
        final StringTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    /**
     * Gets a new tokenizer instance which parses Tab Separated Value strings. The default for TSV processing will be
     * to trim whitespace from both ends (which can be overridden with the setTrimmer method).
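     * <p>
     * For example, an illustrative sketch:
     * <pre>{@code
     * char[] line = "a\tb\tc".toCharArray();
     * List<String> tokens = StringTokenizer.getTSVInstance(line).getTokenList(); // ["a", "b", "c"]
     * }</pre>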
     *
     * @param input
     *            the string to parse
     * @return a new tokenizer instance which parses Tab Separated Value strings.
     */
    public static StringTokenizer getTSVInstance(final char[] input) {
        final StringTokenizer tok = getTSVClone();
        tok.reset(input);
        return tok;
    }

    // -----------------------------------------------------------------------
    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer, but with no text
     * to tokenize.
     * <p>
     * This constructor is normally used with {@link #reset(String)}.
     */
    public StringTokenizer() {
        super();
        this.chars = null;
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
     *
     * @param input
     *            the string which is to be parsed
     */
    public StringTokenizer(final String input) {
        super();
        if (input != null) {
            chars = input.toCharArray();
        } else {
            chars = null;
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final String input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter string.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final String input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final String input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final String input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    /**
     * Constructs a tokenizer splitting on space, tab, newline and form feed as per StringTokenizer.
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     */
    public StringTokenizer(final char[] input) {
        super();
        if (input == null) {
            this.chars = null;
        } else {
            this.chars = input.clone();
        }
    }

    /**
     * Constructs a tokenizer splitting on the specified character.
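     * <p>
     * For example, an illustrative sketch:
     * <pre>{@code
     * List<String> tokens = new StringTokenizer("a;b;c".toCharArray(), ';').getTokenList(); // ["a", "b", "c"]
     * }</pre>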
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     * @param delim
     *            the field delimiter character
     */
    public StringTokenizer(final char[] input, final char delim) {
        this(input);
        setDelimiterChar(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified string.
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     * @param delim
     *            the field delimiter string
     */
    public StringTokenizer(final char[] input, final String delim) {
        this(input);
        setDelimiterString(delim);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher.
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     * @param delim
     *            the field delimiter matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim) {
        this(input);
        setDelimiterMatcher(delim);
    }

    /**
     * Constructs a tokenizer splitting on the specified delimiter character and handling quotes using the specified
     * quote character.
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     * @param delim
     *            the field delimiter character
     * @param quote
     *            the field quoted string character
     */
    public StringTokenizer(final char[] input, final char delim, final char quote) {
        this(input, delim);
        setQuoteChar(quote);
    }

    /**
     * Constructs a tokenizer splitting using the specified delimiter matcher and handling quotes using the specified
     * quote matcher.
     *
     * @param input
     *            the string which is to be parsed; the array is cloned
     * @param delim
     *            the field delimiter matcher
     * @param quote
     *            the field quoted string matcher
     */
    public StringTokenizer(final char[] input, final StringMatcher delim, final StringMatcher quote) {
        this(input, delim);
        setQuoteMatcher(quote);
    }

    // API
    // -----------------------------------------------------------------------
    /**
     * Gets the number of tokens found in the String.
     *
     * @return the number of matched tokens
     */
    public int size() {
        checkTokenized();
        return tokens.length;
    }

    /**
     * Gets the next token from the String. Equivalent to {@link #next()} except it returns null rather than throwing
     * {@link NoSuchElementException} when no tokens remain.
     *
     * @return the next sequential token, or null when no more tokens are found
     */
    public String nextToken() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        return null;
    }

    /**
     * Gets the previous token from the String.
     *
     * @return the previous sequential token, or null when no more tokens are found
     */
    public String previousToken() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        return null;
    }

    /**
     * Gets a copy of the full token list as an independent modifiable array.
     *
     * @return the tokens as a String array
     */
    public String[] getTokenArray() {
        checkTokenized();
        return tokens.clone();
    }

    /**
     * Gets a copy of the full token list as an independent modifiable list.
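     * <p>
     * For example, an illustrative sketch (the default delimiters split on whitespace):
     * <pre>{@code
     * List<String> tokens = new StringTokenizer("a b c").getTokenList(); // ["a", "b", "c"]
     * }</pre>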
     *
     * @return the tokens as a String list
     */
    public List<String> getTokenList() {
        checkTokenized();
        final List<String> list = new ArrayList<>(tokens.length);
        Collections.addAll(list, tokens);

        return list;
    }

    /**
     * Resets this tokenizer, forgetting all parsing and iteration already completed.
     * <p>
     * This method allows the same tokenizer to be reused for the same String.
     *
     * @return this, to enable chaining
     */
    public StringTokenizer reset() {
        tokenPos = 0;
        tokens = null;
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
     *
     * @param input
     *            the new string to tokenize, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final String input) {
        reset();
        if (input != null) {
            this.chars = input.toCharArray();
        } else {
            this.chars = null;
        }
        return this;
    }

    /**
     * Resets this tokenizer, giving it a new input string to parse. In this manner you can re-use a tokenizer with the
     * same settings on multiple input lines.
     *
     * @param input
     *            the new character array to tokenize; the array is cloned, null sets no text to parse
     * @return this, to enable chaining
     */
    public StringTokenizer reset(final char[] input) {
        reset();
        if (input != null) {
            this.chars = input.clone();
        } else {
            this.chars = null;
        }
        return this;
    }

    // ListIterator
    // -----------------------------------------------------------------------
    /**
     * Checks whether there are any more tokens.
     *
     * @return true if there are more tokens
     */
    @Override
    public boolean hasNext() {
        checkTokenized();
        return tokenPos < tokens.length;
    }

    /**
     * Gets the next token.
     *
     * @return the next String token
     * @throws NoSuchElementException
     *             if there are no more elements
     */
    @Override
    public String next() {
        if (hasNext()) {
            return tokens[tokenPos++];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the next token to return.
     *
     * @return the next token index
     */
    @Override
    public int nextIndex() {
        return tokenPos;
    }

    /**
     * Checks whether there are any previous tokens that can be iterated to.
     *
     * @return true if there are previous tokens
     */
    @Override
    public boolean hasPrevious() {
        checkTokenized();
        return tokenPos > 0;
    }

    /**
     * Gets the token previous to the last returned token.
     *
     * @return the previous token
     * @throws NoSuchElementException
     *             if there are no previous elements
     */
    @Override
    public String previous() {
        if (hasPrevious()) {
            return tokens[--tokenPos];
        }
        throw new NoSuchElementException();
    }

    /**
     * Gets the index of the previous token.
     *
     * @return the previous token index
     */
    @Override
    public int previousIndex() {
        return tokenPos - 1;
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("remove() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter is ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void set(final String obj) {
        throw new UnsupportedOperationException("set() is unsupported");
    }

    /**
     * Unsupported ListIterator operation.
     *
     * @param obj
     *            this parameter is ignored.
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public void add(final String obj) {
        throw new UnsupportedOperationException("add() is unsupported");
    }

    // Implementation
    // -----------------------------------------------------------------------
    /**
     * Checks if tokenization has been done, and if not, does it.
     */
    private void checkTokenized() {
        if (tokens == null) {
            if (chars == null) {
                // still call tokenize as subclass may do some work
                final List<String> split = tokenize(null, 0, 0);
                tokens = split.toArray(new String[split.size()]);
            } else {
                final List<String> split = tokenize(chars, 0, chars.length);
                tokens = split.toArray(new String[split.size()]);
            }
        }
    }

    /**
     * Internal method to perform the tokenization.
     * <p>
     * Most users of this class do not need to call this method. This method will be called automatically by other
     * (public) methods when required.
     * <p>
     * This method exists to allow subclasses to add code before or after the tokenization. For example, a subclass
     * could alter the character array, offset or count to be parsed, or call the tokenizer multiple times on multiple
     * strings. It is also possible to filter the results.
     * <p>
     * <code>StringTokenizer</code> will always pass a zero offset and a count equal to the length of the array to this
     * method, however a subclass may pass other values, or even an entirely different array.
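     * <p>
     * For example, an illustrative sketch of a subclass that filters the results (the subclass name is hypothetical):
     * <pre>{@code
     * public class CommentSkippingTokenizer extends StringTokenizer {
     *     protected List<String> tokenize(char[] srcChars, int offset, int count) {
     *         List<String> tokens = new ArrayList<>(super.tokenize(srcChars, offset, count));
     *         tokens.removeIf(token -> token != null && token.startsWith("#"));
     *         return tokens;
     *     }
     * }
     * }</pre>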
     *
     * @param srcChars
     *            the character array being tokenized, may be null
     * @param offset
     *            the start position within the character array, must be valid
     * @param count
     *            the number of characters to tokenize, must be valid
     * @return the modifiable list of String tokens, unmodifiable if null array or zero count
     */
    protected List<String> tokenize(final char[] srcChars, final int offset, final int count) {
        if (srcChars == null || count == 0) {
            return Collections.emptyList();
        }
        final TextStringBuilder buf = new TextStringBuilder();
        final List<String> tokenList = new ArrayList<>();
        int pos = offset;

        // loop around the entire buffer
        while (pos >= 0 && pos < count) {
            // find next token
            pos = readNextToken(srcChars, pos, count, buf, tokenList);

            // handle case where end of string is a delimiter
            if (pos >= count) {
                addToken(tokenList, "");
            }
        }
        return tokenList;
    }

    /**
     * Adds a token to a list, paying attention to the parameters we've set.
     *
     * @param list
     *            the list to add to
     * @param tok
     *            the token to add
     */
    private void addToken(final List<String> list, String tok) {
        if (tok == null || tok.length() == 0) {
            if (isIgnoreEmptyTokens()) {
                return;
            }
            if (isEmptyTokenAsNull()) {
                tok = null;
            }
        }
        list.add(tok);
    }

    /**
     * Reads character by character through the String to get the next token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of the field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @return the starting position of the next field (the character immediately after the delimiter), or -1 if end of
     *         string found
     */
    private int readNextToken(final char[] srcChars, int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList) {
        // skip all leading whitespace, unless it is the
        // field delimiter or the quote character
        while (start < len) {
            final int removeLen = Math.max(getIgnoredMatcher().isMatch(srcChars, start, start, len),
                    getTrimmerMatcher().isMatch(srcChars, start, start, len));
            if (removeLen == 0 || getDelimiterMatcher().isMatch(srcChars, start, start, len) > 0
                    || getQuoteMatcher().isMatch(srcChars, start, start, len) > 0) {
                break;
            }
            start += removeLen;
        }

        // handle reaching end
        if (start >= len) {
            addToken(tokenList, "");
            return -1;
        }

        // handle empty token
        final int delimLen = getDelimiterMatcher().isMatch(srcChars, start, start, len);
        if (delimLen > 0) {
            addToken(tokenList, "");
            return start + delimLen;
        }

        // handle found token
        final int quoteLen = getQuoteMatcher().isMatch(srcChars, start, start, len);
        if (quoteLen > 0) {
            return readWithQuotes(srcChars, start + quoteLen, len, workArea, tokenList, start, quoteLen);
        }
        return readWithQuotes(srcChars, start, len, workArea, tokenList, 0, 0);
    }

    /**
     * Reads a possibly quoted string token.
     *
     * @param srcChars
     *            the character array being tokenized
     * @param start
     *            the first character of the field
     * @param len
     *            the length of the character array being tokenized
     * @param workArea
     *            a temporary work area
     * @param tokenList
     *            the list of parsed tokens
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return the starting position of the next field (the character immediately after the delimiter), or -1 if end of
     *         string found
     */
    private int readWithQuotes(final char[] srcChars, final int start, final int len, final TextStringBuilder workArea,
            final List<String> tokenList, final int quoteStart, final int quoteLen) {
        // Loop until we've found the end of the quoted
        // string or the end of the input
        workArea.clear();
        int pos = start;
        boolean quoting = quoteLen > 0;
        int trimStart = 0;

        while (pos < len) {
            // quoting mode can occur several times throughout a string
            // we must switch between quoting and non-quoting until we
            // encounter a non-quoted delimiter, or end of string
            if (quoting) {
                // In quoting mode

                // If we've found a quote character, see if it's
                // followed by a second quote. If so, then we need
                // to actually put the quote character into the token
                // rather than end the token.
                if (isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    if (isQuote(srcChars, pos + quoteLen, len, quoteStart, quoteLen)) {
                        // matched pair of quotes, thus an escaped quote
                        workArea.append(srcChars, pos, quoteLen);
                        pos += quoteLen * 2;
                        trimStart = workArea.size();
                        continue;
                    }

                    // end of quoting
                    quoting = false;
                    pos += quoteLen;
                    continue;
                }

                // copy regular character from inside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();

            } else {
                // Not in quoting mode

                // check for delimiter, and thus end of token
                final int delimLen = getDelimiterMatcher().isMatch(srcChars, pos, start, len);
                if (delimLen > 0) {
                    // return condition when end of token found
                    addToken(tokenList, workArea.substring(0, trimStart));
                    return pos + delimLen;
                }

                // check for quote, and thus back into quoting mode
                if (quoteLen > 0 && isQuote(srcChars, pos, len, quoteStart, quoteLen)) {
                    quoting = true;
                    pos += quoteLen;
                    continue;
                }

                // check for ignored (outside quotes), and ignore
                final int ignoredLen = getIgnoredMatcher().isMatch(srcChars, pos, start, len);
                if (ignoredLen > 0) {
                    pos += ignoredLen;
                    continue;
                }

                // check for trimmed character
                // don't yet know if it's at the end, so copy to workArea
                // use trimStart to keep track of trim at the end
                final int trimmedLen = getTrimmerMatcher().isMatch(srcChars, pos, start, len);
                if (trimmedLen > 0) {
                    workArea.append(srcChars, pos, trimmedLen);
                    pos += trimmedLen;
                    continue;
                }

                // copy regular character from outside quotes
                workArea.append(srcChars[pos++]);
                trimStart = workArea.size();
            }
        }

        // return condition when end of string found
        addToken(tokenList, workArea.substring(0, trimStart));
        return -1;
    }

    /**
     * Checks if the characters at the index specified match the quote already matched in readNextToken().
     *
     * @param srcChars
     *            the character array being tokenized
     * @param pos
     *            the position to check for a quote
     * @param len
     *            the length of the character array being tokenized
     * @param quoteStart
     *            the start position of the matched quote, 0 if no quoting
     * @param quoteLen
     *            the length of the matched quote, 0 if no quoting
     * @return true if a quote is matched
     */
    private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart,
            final int quoteLen) {
        for (int i = 0; i < quoteLen; i++) {
            if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) {
                return false;
            }
        }
        return true;
    }

    // Delimiter
    // -----------------------------------------------------------------------
    /**
     * Gets the field delimiter matcher.
     *
     * @return the delimiter matcher in use
     */
    public StringMatcher getDelimiterMatcher() {
        return this.delimMatcher;
    }

    /**
     * Sets the field delimiter matcher.
     * <p>
     * The delimiter is used to separate one token from another.
     *
     * @param delim
     *            the delimiter matcher to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterMatcher(final StringMatcher delim) {
        if (delim == null) {
            this.delimMatcher = StringMatcherFactory.INSTANCE.noneMatcher();
        } else {
            this.delimMatcher = delim;
        }
        return this;
    }

    /**
     * Sets the field delimiter character.
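     * <p>
     * For example, an illustrative sketch:
     * <pre>{@code
     * List<String> parts = new StringTokenizer("2023-01-15").setDelimiterChar('-').getTokenList(); // ["2023", "01", "15"]
     * }</pre>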
     *
     * @param delim
     *            the delimiter character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterChar(final char delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.charMatcher(delim));
    }

    /**
     * Sets the field delimiter string.
     *
     * @param delim
     *            the delimiter string to use
     * @return this, to enable chaining
     */
    public StringTokenizer setDelimiterString(final String delim) {
        return setDelimiterMatcher(StringMatcherFactory.INSTANCE.stringMatcher(delim));
    }

    // Quote
    // -----------------------------------------------------------------------
    /**
     * Gets the quote matcher currently in use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data. The
     * default is not to use a quote character (the none matcher).
     *
     * @return the quote matcher in use
     */
    public StringMatcher getQuoteMatcher() {
        return quoteMatcher;
    }

    /**
     * Sets the quote matcher to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     *
     * @param quote
     *            the quote matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteMatcher(final StringMatcher quote) {
        if (quote != null) {
            this.quoteMatcher = quote;
        }
        return this;
    }

    /**
     * Sets the quote character to use.
     * <p>
     * The quote character is used to wrap data between the tokens. This enables delimiters to be entered as data.
     *
     * @param quote
     *            the quote character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setQuoteChar(final char quote) {
        return setQuoteMatcher(StringMatcherFactory.INSTANCE.charMatcher(quote));
    }

    // Ignored
    // -----------------------------------------------------------------------
    /**
     * Gets the ignored character matcher.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region. The default value
     * is not to ignore anything.
     *
     * @return the ignored matcher in use
     */
    public StringMatcher getIgnoredMatcher() {
        return ignoredMatcher;
    }

    /**
     * Sets the matcher for characters to ignore.
     * <p>
     * These characters are ignored when parsing the String, unless they are within a quoted region.
     *
     * @param ignored
     *            the ignored matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredMatcher(final StringMatcher ignored) {
        if (ignored != null) {
            this.ignoredMatcher = ignored;
        }
        return this;
    }

    /**
     * Sets the character to ignore.
     * <p>
     * This character is ignored when parsing the String, unless it is within a quoted region.
     *
     * @param ignored
     *            the ignored character to use
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoredChar(final char ignored) {
        return setIgnoredMatcher(StringMatcherFactory.INSTANCE.charMatcher(ignored));
    }

    // Trimmer
    // -----------------------------------------------------------------------
    /**
     * Gets the trimmer character matcher.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
     * The default value is not to trim anything.
     *
     * @return the trimmer matcher in use
     */
    public StringMatcher getTrimmerMatcher() {
        return trimmerMatcher;
    }

    /**
     * Sets the matcher for characters to trim.
     * <p>
     * These characters are trimmed off on each side of the delimiter until the token or quote is found.
     *
     * @param trimmer
     *            the trimmer matcher to use, null ignored
     * @return this, to enable chaining
     */
    public StringTokenizer setTrimmerMatcher(final StringMatcher trimmer) {
        if (trimmer != null) {
            this.trimmerMatcher = trimmer;
        }
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently returns empty tokens as null. The default for this property is false.
     *
     * @return true if empty tokens are returned as null
     */
    public boolean isEmptyTokenAsNull() {
        return this.emptyAsNull;
    }

    /**
     * Sets whether the tokenizer should return empty tokens as null. The default for this property is false.
     *
     * @param emptyAsNull
     *            whether empty tokens are returned as null
     * @return this, to enable chaining
     */
    public StringTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
        this.emptyAsNull = emptyAsNull;
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets whether the tokenizer currently ignores empty tokens. The default for this property is true.
     *
     * @return true if empty tokens are not returned
     */
    public boolean isIgnoreEmptyTokens() {
        return ignoreEmptyTokens;
    }

    /**
     * Sets whether the tokenizer should ignore and not return empty tokens. The default for this property is true.
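     * <p>
     * For example, an illustrative sketch:
     * <pre>{@code
     * StringTokenizer tok = new StringTokenizer("a,,c", ',');
     * tok.getTokenList();                                     // ["a", "c"] (empty tokens ignored by default)
     * tok.reset().setIgnoreEmptyTokens(false).getTokenList(); // ["a", "", "c"]
     * }</pre>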
     *
     * @param ignoreEmptyTokens
     *            whether empty tokens are not returned
     * @return this, to enable chaining
     */
    public StringTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
        this.ignoreEmptyTokens = ignoreEmptyTokens;
        return this;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets the String content that the tokenizer is parsing.
     *
     * @return the string content being parsed
     */
    public String getContent() {
        if (chars == null) {
            return null;
        }
        return new String(chars);
    }

    // -----------------------------------------------------------------------
    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list. If a {@link CloneNotSupportedException} is caught, <code>null</code> is returned.
     *
     * @return a new instance of this Tokenizer which has been reset.
     */
    @Override
    public Object clone() {
        try {
            return cloneReset();
        } catch (final CloneNotSupportedException ex) {
            return null;
        }
    }

    /**
     * Creates a new instance of this Tokenizer. The new instance is reset so that it will be at the start of the token
     * list.
     *
     * @return a new instance of this Tokenizer which has been reset.
     * @throws CloneNotSupportedException
     *             if there is a problem cloning
     */
    Object cloneReset() throws CloneNotSupportedException {
        // this method exists to enable 100% test coverage
        final StringTokenizer cloned = (StringTokenizer) super.clone();
        if (cloned.chars != null) {
            cloned.chars = cloned.chars.clone();
        }
        cloned.reset();
        return cloned;
    }

    // -----------------------------------------------------------------------
    /**
     * Gets the String representation of this tokenizer, including the parsed tokens if tokenization has occurred.
     *
     * @return the string representation of this tokenizer
     */
    @Override
    public String toString() {
        if (tokens == null) {
            return "StringTokenizer[not tokenized yet]";
        }
        return "StringTokenizer" + getTokenList();
    }

}