/*
 * Decompiled with CFR 0.152.
 */
package org.exist.storage.analysis;

import org.exist.storage.analysis.TextToken;
import org.exist.storage.analysis.Tokenizer;

/**
 * Hand-written tokenizer that splits a {@link CharSequence} into
 * {@link TextToken}s.
 *
 * <p>The scanner keeps a cursor ({@code pos}) into the current text and offers
 * one-character lookahead through {@link #LA(int)}. {@link #nextToken(boolean)}
 * returns only alphabetic and alphanumeric tokens; punctuation and whitespace
 * are either merged into a surrounding token or skipped.
 *
 * <p>Instances are stateful and reuse a single scratch token for punctuation
 * (see {@link #p()}).
 */
public class SimpleTokenizer
implements Tokenizer {
    /** Sentinel returned by {@link #LA(int)} when the requested position lies past the end of the text. */
    private static final char EOF_CHAR = '\uffff';

    // Token type codes used with TextToken. The values for alpha, alphanum,
    // number and punctuation are the ones this class passes when it creates
    // those tokens itself.
    /** Type assigned by {@link #alpha(TextToken, boolean)}. */
    private static final int TYPE_ALPHA = 1;
    /** Type assigned by {@link #alphanum(TextToken, boolean)}. */
    private static final int TYPE_ALPHANUM = 2;
    /** Type assigned by {@link #number()}. */
    private static final int TYPE_NUMBER = 6;
    /** Type assigned by {@link #p()}. */
    private static final int TYPE_PUNCT = 7;
    // NOTE(review): -1 and 8 are presumably the types carried by
    // TextToken.EOF_TOKEN and TextToken.WS_TOKEN respectively - confirm
    // against TextToken.
    private static final int TYPE_EOF = -1;
    private static final int TYPE_WS = 8;

    /** Cursor: number of characters already consumed; LA(1) reads {@code text.charAt(pos)}. */
    private int pos = 0;
    /** Stemming flag; stored by the constructor/setter but not read anywhere in this class. */
    private boolean stem = false;
    /** Text currently being tokenized; {@code null} until {@code setText} is called. */
    private CharSequence text;
    /** Length of {@link #text}, cached by {@code setText}. */
    private int len = 0;
    /** Scratch token reused for every punctuation token returned by {@link #p()}. */
    private final TextToken temp = new TextToken();

    /** Creates a tokenizer with stemming disabled. */
    public SimpleTokenizer() {
    }

    /**
     * Creates a tokenizer with the given stemming flag.
     *
     * @param stem value stored in the stemming flag
     */
    public SimpleTokenizer(boolean stem) {
        this.stem = stem;
    }

    /**
     * Sets the stemming flag.
     *
     * @param stem value stored in the stemming flag
     */
    public void setStemming(boolean stem) {
        this.stem = stem;
    }

    /**
     * Looks ahead without consuming.
     *
     * @param i 1-based lookahead distance; {@code LA(1)} is the next unconsumed character
     * @return the character at that distance, or {@code '\uffff'} when it lies past the end
     */
    private char LA(int i) {
        int current = this.pos + i;
        return current > this.len ? EOF_CHAR : this.text.charAt(current - 1);
    }

    /**
     * Scans an alphabetic token starting at the cursor.
     *
     * <p>Letters, marks (see {@link #is_mark(char)}), non-breaking characters
     * and, if allowed, wildcards are accumulated. A character for which
     * {@link #singleCharToken(char)} holds forms a token on its own: it is
     * consumed only if nothing has been accumulated yet, and scanning stops
     * either way. A backslash followed by a wildcard also stops scanning. If
     * the character following the scanned run is a digit, scanning continues
     * in {@link #alphanum(TextToken, boolean)} on the same token.
     *
     * @param token token to extend, or {@code null} to start a new one at the cursor
     * @param allowWildcards whether {@code ?} and {@code *} count as token characters
     * @return the scanned token
     */
    protected TextToken alpha(TextToken token, boolean allowWildcards) {
        if (token == null) {
            token = new TextToken(TYPE_ALPHA, this.text, this.pos);
        } else {
            token.setType(TYPE_ALPHA);
        }
        char ch = this.LA(1);
        int count = 0;
        while (ch != EOF_CHAR && !(ch == '\\' && SimpleTokenizer.isWildcard(this.LA(2)))) {
            // The comparison is a cheap pre-filter for singleCharToken(). It
            // must be inclusive: singleCharToken() accepts U+2E80, and
            // nextTerminalToken() dispatches here for it. With an exclusive
            // bound U+2E80 (which is not a letter) was never consumed, so the
            // tokenizer kept returning the same empty token forever.
            if (ch >= '\u2e80' && SimpleTokenizer.singleCharToken(ch)) {
                if (count != 0) break;
                token.consumeNext();
                this.consume();
                ch = this.LA(1);
                break;
            }
            if (!Character.isLetter(ch) && !this.is_mark(ch) && !SimpleTokenizer.nonBreakingChar(ch) && (!allowWildcards || !SimpleTokenizer.isWildcard(ch))) break;
            token.consumeNext();
            this.consume();
            ch = this.LA(1);
            ++count;
        }
        if (Character.isDigit(ch)) {
            return this.alphanum(token, allowWildcards);
        }
        return token;
    }

    /** Returns {@code true} for the wildcard characters {@code ?} and {@code *}. */
    private static boolean isWildcard(char ch) {
        return ch == '?' || ch == '*';
    }

    /**
     * Scans an alphanumeric token starting at the cursor, consuming letters,
     * digits and, if allowed, wildcards until another character or the end of
     * the text is reached.
     *
     * @param token token to extend, or {@code null} to start a new one at the cursor
     * @param allowWildcards whether {@code ?} and {@code *} count as token characters
     * @return the scanned token
     */
    protected TextToken alphanum(TextToken token, boolean allowWildcards) {
        if (token == null) {
            token = new TextToken(TYPE_ALPHANUM, this.text, this.pos);
        } else {
            token.setType(TYPE_ALPHANUM);
        }
        while (this.LA(1) != EOF_CHAR) {
            if (Character.isLetterOrDigit(this.LA(1))) {
                token.consumeNext();
                this.consume();
                continue;
            }
            if (!allowWildcards || !SimpleTokenizer.isWildcard(this.LA(1))) break;
            token.consumeNext();
            this.consume();
        }
        return token;
    }

    /** Advances the cursor by one character. */
    protected void consume() {
        ++this.pos;
    }

    /**
     * Advances the cursor and returns the shared end-of-input token.
     *
     * @return {@code TextToken.EOF_TOKEN}
     */
    protected TextToken eof() {
        this.consume();
        return TextToken.EOF_TOKEN;
    }

    /**
     * @return the length of the current text as recorded by {@code setText}
     */
    public int getLength() {
        return this.len;
    }

    /**
     * @return the current text as a {@code String}
     * @throws NullPointerException if no text has been set
     */
    public String getText() {
        return this.text.toString();
    }

    /**
     * Scans the next raw token of any kind at the cursor.
     *
     * <p>Dispatch order: end of input, then {@link #alpha(TextToken, boolean)},
     * then {@link #alphanum(TextToken, boolean)}, then punctuation via
     * {@link #p()}; every other character is consumed by {@link #whitespace()}.
     *
     * @param wildcards whether {@code ?} and {@code *} count as token characters
     * @return the scanned token; punctuation tokens are the shared scratch
     *         instance and are overwritten by the next punctuation scan
     */
    protected TextToken nextTerminalToken(boolean wildcards) {
        TextToken token = null;
        char ch = this.LA(1);
        if (ch == EOF_CHAR) {
            return this.eof();
        }
        if (Character.isLetter(ch) || this.is_mark(ch) || SimpleTokenizer.nonBreakingChar(ch) || SimpleTokenizer.singleCharToken(ch) || wildcards && SimpleTokenizer.isWildcard(ch)) {
            token = this.alpha(null, wildcards);
        }
        if (token == null && (Character.isLetterOrDigit(ch) || wildcards && SimpleTokenizer.isWildcard(ch))) {
            token = this.alphanum(null, wildcards);
        }
        if (token == null) {
            switch (ch) {
                case '\\': {
                    // An escaped wildcard: drop the backslash so that p()
                    // below consumes the wildcard itself.
                    if (SimpleTokenizer.isWildcard(this.LA(2))) {
                        this.consume();
                    }
                }
                // intentional fall-through: the backslash case ends in p() too
                case '*': 
                case ',': 
                case '-': 
                case '.': 
                case '/': 
                case ':': 
                case '@': 
                case '_': {
                    token = this.p();
                    break;
                }
                default: {
                    token = this.whitespace();
                }
            }
        }
        return token;
    }

    /**
     * Returns the next token with wildcards disabled.
     *
     * @return the next token, or {@code null} at end of input or on error
     * @see #nextToken(boolean)
     */
    public TextToken nextToken() {
        return this.nextToken(false);
    }

    /**
     * Returns the next alphabetic or alphanumeric token, skipping every other
     * raw token.
     *
     * <p>A token directly followed by certain punctuation is merged with the
     * raw tokens that follow, so that runs joined by punctuation come back as
     * a single token spanning from the first token's start to the last merged
     * token's end. Punctuation that is itself followed by whitespace or the
     * end of the text is consumed and not merged.
     *
     * @param wildcards whether {@code ?} and {@code *} count as token characters
     * @return the next token, or {@code null} at end of input; {@code null} is
     *         also returned if any exception occurs, after the text and stack
     *         trace have been printed
     */
    public TextToken nextToken(boolean wildcards) {
        try {
            while (true) {
                TextToken token = this.nextTerminalToken(wildcards);
                // Remember where the token ended so failed merge attempts can rewind.
                int oldPos = this.pos;
                char LA1 = this.LA(1);
                switch (token.getType()) {
                    case TYPE_EOF: {
                        return null;
                    }
                    case TYPE_ALPHA: {
                        switch (LA1) {
                            case '\'': {
                                // word'word: merge only if an alpha token follows the apostrophe.
                                this.consume();
                                TextToken next = this.nextTerminalToken(wildcards);
                                if (next != null && next.getType() == TYPE_ALPHA) {
                                    return new TextToken(TYPE_ALPHA, this.text, token.startOffset(), next.endOffset());
                                }
                                this.pos = oldPos;
                                break;
                            }
                            case '.': 
                            case '/': 
                            case ':': 
                            case '@': 
                            case '_': {
                                TextToken next;
                                // Trailing punctuation: swallow it and return the plain token.
                                if (this.LA(2) == EOF_CHAR || Character.isWhitespace(this.LA(2))) {
                                    this.consume();
                                    break;
                                }
                                // Collect raw tokens until end of input, whitespace, or a
                                // punctuation token for which the LA(2) check below fails.
                                TextToken last = null;
                                while ((next = this.nextTerminalToken(wildcards)) != null && next.getType() != TYPE_EOF && next.getType() != TYPE_WS && (next.getType() != TYPE_PUNCT || this.LA(2) != EOF_CHAR && !Character.isWhitespace(this.LA(2)))) {
                                    last = next;
                                }
                                if (last != null) {
                                    token = new TextToken(TYPE_ALPHANUM, this.text, token.startOffset(), last.endOffset());
                                    break;
                                }
                                // Nothing merged: rewind to just after the original token.
                                this.pos = oldPos;
                            }
                        }
                        return token;
                    }
                    case TYPE_ALPHANUM: {
                        switch (LA1) {
                            case '*': 
                            case ',': 
                            case '-': 
                            case '.': 
                            case '/': 
                            case ':': 
                            case '@': 
                            case '_': {
                                TextToken next;
                                // Trailing punctuation: swallow it and return the plain token.
                                if (this.LA(2) == EOF_CHAR || Character.isWhitespace(this.LA(2))) {
                                    this.consume();
                                    break;
                                }
                                // Collect raw tokens until end of input or whitespace.
                                TextToken last = null;
                                while ((next = this.nextTerminalToken(wildcards)) != null && next.getType() != TYPE_EOF && next.getType() != TYPE_WS) {
                                    last = next;
                                }
                                token = last != null ? new TextToken(TYPE_ALPHANUM, this.text, token.startOffset(), last.endOffset()) : new TextToken(TYPE_ALPHANUM, this.text, token.startOffset(), this.pos);
                            }
                        }
                        return token;
                    }
                }
                // Any other token type (punctuation, whitespace) is skipped.
            }
        }
        catch (Exception e) {
            // Best-effort: report the offending text and signal "no token".
            System.out.println("text: " + this.text);
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Scans a run of digits at the cursor.
     *
     * @return the digit token, or {@code null} with the cursor rewound when
     *         the digits are directly followed by a letter
     */
    protected TextToken number() {
        TextToken token = new TextToken(TYPE_NUMBER, this.text, this.pos);
        int oldPos = this.pos;
        while (this.LA(1) != EOF_CHAR && Character.isDigit(this.LA(1))) {
            token.consumeNext();
            this.consume();
        }
        if (Character.isLetter(this.LA(1))) {
            this.pos = oldPos;
            return null;
        }
        return token;
    }

    /**
     * Consumes one character as a punctuation token.
     *
     * @return the shared scratch token, re-initialised at the cursor; its
     *         contents are overwritten by the next call, so callers must not
     *         hold on to it
     */
    protected TextToken p() {
        this.temp.set(TYPE_PUNCT, this.text, this.pos);
        this.temp.consumeNext();
        this.consume();
        return this.temp;
    }

    /**
     * Sets the text to tokenize and resets the cursor to its start.
     *
     * @param text the text to tokenize; must not be {@code null}
     */
    public void setText(CharSequence text) {
        this.pos = 0;
        this.len = text.length();
        this.text = text;
    }

    /**
     * Sets the text to tokenize and positions the cursor at {@code offset}.
     *
     * @param text the text to tokenize; must not be {@code null}
     * @param offset 0-based index of the first character to be read
     */
    public void setText(CharSequence text, int offset) {
        this.pos = offset;
        this.len = text.length();
        this.text = text;
    }

    /**
     * Consumes one character and returns the shared whitespace token. Used for
     * every character that no other scanner claims.
     *
     * @return {@code TextToken.WS_TOKEN}
     */
    protected TextToken whitespace() {
        this.consume();
        return TextToken.WS_TOKEN;
    }

    /**
     * Returns {@code true} for characters that are tokenized one character per
     * token: a set of CJK-related Unicode ranges.
     */
    private static boolean singleCharToken(char ch) {
        return ch >= '\u2e80' && ch <= '\u2eff'     // CJK Radicals Supplement
            || ch >= '\u2f00' && ch <= '\u2fdf'     // Kangxi Radicals
            || ch >= '\u2ff0' && ch <= '\u2fff'     // Ideographic Description Characters
            || ch >= '\u3200' && ch <= '\u32ff'     // Enclosed CJK Letters and Months
            || ch >= '\u3300' && ch <= '\u33ff'     // CJK Compatibility
            || ch >= '\u3400' && ch <= '\u4db5'     // CJK Unified Ideographs Extension A
            || ch >= '\u4dc0' && ch <= '\u4dff'     // Yijing Hexagram Symbols
            || ch >= '\u4e00' && ch <= '\u9fff'     // CJK Unified Ideographs
            || ch >= '\uf900' && ch <= '\ufaff'     // CJK Compatibility Ideographs
            || ch >= '\ufe30' && ch <= '\ufe4f';    // CJK Compatibility Forms
    }

    /**
     * Returns {@code true} for characters that are accumulated into an
     * alphabetic token alongside letters: a set of kana, Bopomofo and Hangul
     * Unicode ranges.
     */
    private static boolean nonBreakingChar(char ch) {
        return ch >= '\u3040' && ch <= '\u309f'     // Hiragana
            || ch >= '\u30a0' && ch <= '\u30ff'     // Katakana
            || ch >= '\u3100' && ch <= '\u312f'     // Bopomofo
            || ch >= '\u3130' && ch <= '\u318f'     // Hangul Compatibility Jamo
            || ch >= '\u3190' && ch <= '\u319f'     // Kanbun
            || ch >= '\u31a0' && ch <= '\u31bf'     // Bopomofo Extended
            || ch >= '\u31f0' && ch <= '\u31ff'     // Katakana Phonetic Extensions
            || ch >= '\uac00' && ch <= '\ud7a3';    // Hangul Syllables
    }

    /**
     * Returns {@code true} for code points U+093E through U+094B (bounds of the
     * comparison are exclusive), which lie in the Devanagari block.
     */
    private boolean is_mark(char ch) {
        return ch > '\u093d' && ch < '\u094c';
    }

    /**
     * Small demo: tokenizes a fixed Chinese sentence and prints each token's
     * text on its own line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String t1 = "\u4ed6\u4e3a\u8fd9\u9879\u5de5\u7a0b\u6295\u5165\u4e86\u5341\u4e09\u5e74\u65f6\u95f4\u3002";
        SimpleTokenizer tokenizer = new SimpleTokenizer();
        tokenizer.setText(t1);
        TextToken token = tokenizer.nextToken(false);
        while (token != null && token.getType() != TYPE_EOF) {
            System.out.println(token.getText());
            token = tokenizer.nextToken(false);
        }
    }
}

