001    package com.saelist.stx.parser;
002    
003    import java.util.*;
004    import org.apache.log4j.*;
005    
006    /**
007      * Tokenizes free form structured text into @{link Token}s.
008      */
009    public class Tokenizer {
010    
011      public static Logger logger = Logger.getLogger("com.saelist.stx.parser.Tokenizer");
012    
013      public static List tokenize(String text) { // a[b] c[ 'defg' ]
014    
015        List tokens = new ArrayList(); // Change to linked list for efficiency.
016    
017        String delimiters = "'\"[] \t\n\r";
018        int line = 0; // Counting from 0.
019        int column = 0; // Counting from 0.
020        int tokenColumn = 0;
021        int tokenLine = 0;
022        int tokenStart = Integer.MAX_VALUE;
023        boolean textAvailable = false;
024        for(int i = 0; i < text.length(); i++) {
025    
026          char ch = text.charAt(i);
027          if(delimiters.indexOf(ch) >= 0) {
028    
029            if(textAvailable)
030              tokens.add(new Token(tokenLine, tokenColumn, Token.TEXT, text.substring(tokenStart, i)));
031    
032            tokens.add(new Token(line, column, ch, "" + ch));
033    
034            textAvailable = false;
035    
036          } else if(! textAvailable) {
037            tokenStart = i;
038            tokenColumn = column;
039            tokenLine = line;
040            textAvailable = true;
041          }
042    
043          if(ch == '\n') {
044            line ++;
045            column = 0;
046          } else
047            column++;
048    
049        }
050        return tokens;
051      }
052    
053      /** Create a new token with the same column, line and type as the start
054          token but with the concatenation of all token values between start
055          and stop as value.
056        */
057      static Token join(List tokens, int start, int stop) {
058        StringBuffer  buffer = new StringBuffer();
059        for(int i = start + 1; i <= stop - 1; i++)
060          buffer.append(((Token) tokens.get(i)).getValue());
061        Token startToken = (Token) tokens.get(start);
062        logger.debug("join(): buffer=" + buffer.toString());
063        return new Token(startToken.getLine(), startToken.getColumn(), startToken.getType(), buffer.toString());
064      }
065    
066      /** Reconstructs quoted text and removes unquoted whitespace.
067          E.g the three tokens "'", "def", "'" are joined in "'abc'"
068        */
069      public static List assemble(List tokens) {
070        int inquote = 0; // 0 = not in quote, SQ = in single quote, DQ = in double quote
071        int quoteStart = 0;
072        int i = 0;
073        List assembly = new ArrayList(); // Change to linked list for efficiency.
074        for(Iterator it = tokens.iterator(); it.hasNext(); i++ ) {
075          Token token = (Token) it.next();
076          int type = token.getType();
077          if(inquote == 0) {
078            if(type == Token.SQ || type == Token.DQ) {
079              inquote = type;
080              quoteStart = i;
081            } else if(type != Token.SPACE && type != Token.NEWLINE && type != Token.CR && type != Token.TAB)
082              assembly.add(token);
083          } else if(inquote == Token.SQ) {
084            if(type == Token.SQ) {
085              assembly.add(join(tokens, quoteStart, i));
086              inquote = 0;
087            }
088          } else if(inquote == Token.DQ) {
089            if(type == Token.SQ) {
090              assembly.add(join(tokens, quoteStart, i));
091              inquote = 0;
092            }
093          }
094        }
095        return assembly;
096      }
097    
098      public static String toString(List tokens) {
099        StringBuffer result = new StringBuffer();
100    
101        for(Iterator it = tokens.iterator(); it.hasNext(); )
102          result.append( "token[ " + it.next() + " ]\n");
103    
104        return result.toString();
105    
106      }
107    
108    }