/*
 * Decompiled with CFR 0.152.
 */
package org.apache.sysds.runtime.transform.tokenize.builder;

import java.util.ArrayList;
import java.util.List;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.frame.data.FrameBlock;
import org.apache.sysds.runtime.transform.tokenize.DocumentRepresentation;
import org.apache.sysds.runtime.transform.tokenize.Token;
import org.apache.sysds.runtime.transform.tokenize.builder.TokenizerBuilderWhitespaceSplit;
import org.apache.sysds.runtime.util.UtilFunctions;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

/**
 * Tokenizer builder that post-processes the representation produced by
 * {@link TokenizerBuilderWhitespaceSplit} into n-grams.
 *
 * <p>Two modes are supported, selected through the {@code ngram_type} parameter:
 * <ul>
 *   <li>{@code document} (default): the n-gram split is delegated to
 *       {@link DocumentRepresentation#splitIntoNgrams(int, int)}.</li>
 *   <li>{@code token}: every token of a document is replaced by the character
 *       n-grams of its text, see {@link #splitIntoNgrams(Token, int, int)}.</li>
 * </ul>
 *
 * <p>The n-gram sizes range from {@link #minGram} to {@link #maxGram}, both inclusive.
 */
public class TokenizerBuilderNgram extends TokenizerBuilderWhitespaceSplit {
    private static final long serialVersionUID = -6297904316677723802L;

    /** Selects whether n-grams are built per document or per token. */
    private enum NgramType {
        DOCUMENT,
        TOKEN
    }

    /** Smallest n-gram size (inclusive); defaults to 1. */
    public int minGram = 1;
    /** Largest n-gram size (inclusive); defaults to 2. */
    public int maxGram = 2;
    /** Active n-gram mode; defaults to {@code DOCUMENT}. */
    public NgramType ngramType = NgramType.DOCUMENT;

    /**
     * Creates the builder and reads the optional {@code min_gram}, {@code max_gram}
     * and {@code ngram_type} entries from {@code params}. Entries that are absent
     * leave the corresponding default untouched.
     *
     * @param idCols      forwarded unchanged to the parent constructor
     * @param tokenizeCol forwarded unchanged to the parent constructor
     * @param params      optional parameters; may be {@code null}
     * @throws JSONException       if a present parameter cannot be read with the expected type
     * @throws DMLRuntimeException if {@code ngram_type} is neither {@code document} nor {@code token}
     */
    public TokenizerBuilderNgram(int[] idCols, int tokenizeCol, JSONObject params) throws JSONException {
        super(idCols, tokenizeCol, params);
        if (params == null) {
            // Nothing to override, keep the defaults.
            return;
        }
        if (params.has("min_gram")) {
            this.minGram = params.getInt("min_gram");
        }
        if (params.has("max_gram")) {
            this.maxGram = params.getInt("max_gram");
        }
        if (params.has("ngram_type")) {
            // equalsIgnoreCase keeps the match case-insensitive without depending on
            // the JVM default locale, and a null value falls through to the error below.
            String type = params.getString("ngram_type");
            if ("document".equalsIgnoreCase(type)) {
                this.ngramType = NgramType.DOCUMENT;
            } else if ("token".equalsIgnoreCase(type)) {
                this.ngramType = NgramType.TOKEN;
            } else {
                throw new DMLRuntimeException("Invalid ngram type, choose between 'token' and 'document'");
            }
        }
    }

    /**
     * Splits the text of a single-sub-token token into character n-grams.
     *
     * <p>For every size {@code n} from {@code minGram} to {@code maxGram} (inclusive)
     * all substrings of length {@code n} are emitted from left to right; all n-grams
     * of one size precede those of the next size. Each new token starts at the start
     * index of the original token's first sub-token plus the substring offset. Sizes
     * larger than the text length yield no tokens, and {@code maxGram < minGram}
     * yields an empty list.
     *
     * @param token   token to split; must have exactly one sub-token
     * @param minGram smallest n-gram size, must be at least 1
     * @param maxGram largest n-gram size
     * @return the newly created n-gram tokens, possibly empty
     * @throws DMLRuntimeException if the token does not have exactly one sub-token,
     *                             or if {@code minGram} is smaller than 1
     */
    public List<Token> splitIntoNgrams(Token token, int minGram, int maxGram) {
        if (token.getNumSubTokens() == 0) {
            throw new DMLRuntimeException("Cannot create ngram of token where there are no subTokens");
        }
        if (token.getNumSubTokens() != 1) {
            throw new DMLRuntimeException("Cannot create ngram of token where there are more than 1 subTokens");
        }
        if (minGram < 1) {
            // A size of 0 would emit empty-string tokens and a negative size would fail
            // inside String.substring with an unrelated index error; reject both up front.
            throw new DMLRuntimeException("Invalid ngram size, min_gram must be at least 1 but was " + minGram);
        }
        String tokenText = token.toString();
        int textLength = tokenText.length();
        List<Token> newTokens = new ArrayList<>();
        for (int n = minGram; n <= maxGram; ++n) {
            // Last valid start offset for a substring of length n is textLength - n.
            for (int i = 0; i + n <= textLength; ++i) {
                String substring = tokenText.substring(i, i + n);
                newTokens.add(new Token(substring, token.getStartIndex(0) + (long) i));
            }
        }
        return newTokens;
    }

    /**
     * Builds the parent's internal representation for the requested rows and then
     * rewrites each document of that row range into n-grams according to
     * {@link #ngramType}.
     *
     * <p>The processed rows are {@code rowStart} (inclusive) up to
     * {@code UtilFunctions.getEndIndex(in.getNumRows(), rowStart, blk)} (exclusive).
     *
     * @param in                     input frame
     * @param internalRepresentation per-row document representations, filled by the parent
     *                               implementation and updated in place here
     * @param rowStart               first row to process
     * @param blk                    block size passed on to the end-index computation
     */
    @Override
    public void createInternalRepresentation(FrameBlock in, DocumentRepresentation[] internalRepresentation, int rowStart, int blk) {
        super.createInternalRepresentation(in, internalRepresentation, rowStart, blk);
        int endIndex = UtilFunctions.getEndIndex(in.getNumRows(), rowStart, blk);
        for (int row = rowStart; row < endIndex; ++row) {
            DocumentRepresentation documentRepresentation = internalRepresentation[row];
            if (this.ngramType == NgramType.DOCUMENT) {
                documentRepresentation.splitIntoNgrams(this.minGram, this.maxGram);
            } else if (this.ngramType == NgramType.TOKEN) {
                ArrayList<Token> newTokens = new ArrayList<>();
                for (Token wordToken : documentRepresentation.getTokens()) {
                    newTokens.addAll(this.splitIntoNgrams(wordToken, this.minGram, this.maxGram));
                }
                documentRepresentation.tokens = newTokens;
            }
            // Any other value (i.e. null) leaves the document untouched.
        }
    }
}

