/*
 * Decompiled with CFR 0.152.
 */
package opennlp.tools.tokenize;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Pattern;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.Span;

public class WordpieceTokenizer
implements Tokenizer {
    private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("\\p{Punct}+");
    private static final String CLASSIFICATION_TOKEN = "[CLS]";
    private static final String SEPARATOR_TOKEN = "[SEP]";
    private static final String UNKNOWN_TOKEN = "[UNK]";
    private final Set<String> vocabulary;
    private int maxTokenLength = 50;

    public WordpieceTokenizer(Set<String> vocabulary) {
        this.vocabulary = vocabulary;
    }

    public WordpieceTokenizer(Set<String> vocabulary, int maxTokenLength) {
        this(vocabulary);
        this.maxTokenLength = maxTokenLength;
    }

    @Override
    public Span[] tokenizePos(String text) {
        return null;
    }

    @Override
    public String[] tokenize(String text) {
        String[] split;
        LinkedList<Object> tokens = new LinkedList<Object>();
        tokens.add(CLASSIFICATION_TOKEN);
        String spacedPunctuation = PUNCTUATION_PATTERN.matcher(text).replaceAll(" $0 ");
        block0: for (String token2 : split = WhitespaceTokenizer.INSTANCE.tokenize(spacedPunctuation)) {
            char[] characters = token2.toCharArray();
            if (characters.length <= this.maxTokenLength) {
                int start = 0;
                while (start < characters.length) {
                    int end;
                    boolean found = false;
                    for (end = characters.length; start < end; --end) {
                        Object substring = String.valueOf(characters, start, end - start);
                        if (start > 0) {
                            substring = "##" + (String)substring;
                        }
                        if (!this.vocabulary.contains(substring)) continue;
                        tokens.add(substring);
                        start = end;
                        found = true;
                        break;
                    }
                    if (!found) {
                        tokens.add(UNKNOWN_TOKEN);
                        continue block0;
                    }
                    start = end;
                }
                continue;
            }
            tokens.add(UNKNOWN_TOKEN);
        }
        tokens.add(SEPARATOR_TOKEN);
        return tokens.toArray(new String[0]);
    }

    public int getMaxTokenLength() {
        return this.maxTokenLength;
    }
}

