001 package com.saelist.stx.parser;
002
003 import java.util.*;
004 import org.apache.log4j.*;
005
006 /**
007 * Tokenizes free form structured text into @{link Token}s.
008 */
009 public class Tokenizer {
010
011 public static Logger logger = Logger.getLogger("com.saelist.stx.parser.Tokenizer");
012
013 public static List tokenize(String text) { // a[b] c[ 'defg' ]
014
015 List tokens = new ArrayList(); // Change to linked list for efficiency.
016
017 String delimiters = "'\"[] \t\n\r";
018 int line = 0; // Counting from 0.
019 int column = 0; // Counting from 0.
020 int tokenColumn = 0;
021 int tokenLine = 0;
022 int tokenStart = Integer.MAX_VALUE;
023 boolean textAvailable = false;
024 for(int i = 0; i < text.length(); i++) {
025
026 char ch = text.charAt(i);
027 if(delimiters.indexOf(ch) >= 0) {
028
029 if(textAvailable)
030 tokens.add(new Token(tokenLine, tokenColumn, Token.TEXT, text.substring(tokenStart, i)));
031
032 tokens.add(new Token(line, column, ch, "" + ch));
033
034 textAvailable = false;
035
036 } else if(! textAvailable) {
037 tokenStart = i;
038 tokenColumn = column;
039 tokenLine = line;
040 textAvailable = true;
041 }
042
043 if(ch == '\n') {
044 line ++;
045 column = 0;
046 } else
047 column++;
048
049 }
050 return tokens;
051 }
052
053 /** Create a new token with the same column, line and type as the start
054 token but with the concatenation of all token values between start
055 and stop as value.
056 */
057 static Token join(List tokens, int start, int stop) {
058 StringBuffer buffer = new StringBuffer();
059 for(int i = start + 1; i <= stop - 1; i++)
060 buffer.append(((Token) tokens.get(i)).getValue());
061 Token startToken = (Token) tokens.get(start);
062 logger.debug("join(): buffer=" + buffer.toString());
063 return new Token(startToken.getLine(), startToken.getColumn(), startToken.getType(), buffer.toString());
064 }
065
066 /** Reconstructs quoted text and removes unquoted whitespace.
067 E.g the three tokens "'", "def", "'" are joined in "'abc'"
068 */
069 public static List assemble(List tokens) {
070 int inquote = 0; // 0 = not in quote, SQ = in single quote, DQ = in double quote
071 int quoteStart = 0;
072 int i = 0;
073 List assembly = new ArrayList(); // Change to linked list for efficiency.
074 for(Iterator it = tokens.iterator(); it.hasNext(); i++ ) {
075 Token token = (Token) it.next();
076 int type = token.getType();
077 if(inquote == 0) {
078 if(type == Token.SQ || type == Token.DQ) {
079 inquote = type;
080 quoteStart = i;
081 } else if(type != Token.SPACE && type != Token.NEWLINE && type != Token.CR && type != Token.TAB)
082 assembly.add(token);
083 } else if(inquote == Token.SQ) {
084 if(type == Token.SQ) {
085 assembly.add(join(tokens, quoteStart, i));
086 inquote = 0;
087 }
088 } else if(inquote == Token.DQ) {
089 if(type == Token.SQ) {
090 assembly.add(join(tokens, quoteStart, i));
091 inquote = 0;
092 }
093 }
094 }
095 return assembly;
096 }
097
098 public static String toString(List tokens) {
099 StringBuffer result = new StringBuffer();
100
101 for(Iterator it = tokens.iterator(); it.hasNext(); )
102 result.append( "token[ " + it.next() + " ]\n");
103
104 return result.toString();
105
106 }
107
108 }