/* $Id: String_constraint.cpp 638254 2021-09-27 14:11:38Z grichenk $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Author:  J. Chen
 *
 * File Description:
 *   Evaluate if a string and an object match to CString_constraint
 *
 * Remark:
 *   This code was originally generated by application DATATOOL
 *   using the following specifications:
 *   'macro.asn'.
 */

#include <ncbi_pch.hpp>
#include <objects/macro/String_constraint.hpp>
#include <objects/seqfeat/Seq_feat.hpp>
#include <objects/seqfeat/Imp_feat.hpp>

BEGIN_NCBI_SCOPE
BEGIN_objects_SCOPE // namespace ncbi::objects::


// Lowercase "weasel" words recognised by the string constraint.
// The position of a word in this list is also its bit number in the weasel
// masks: CMatchString::x_PopWeasel() sets bit (1 << n) for word n, and
// x_DoesSingleStringMatchConstraint() tests bit (1 << k) for word k.
// NOTE(review): the state machine included from "weasel.inc" presumably
// depends on these indices as well -- confirm before reordering or adding.
const vector<string> CString_constraint::s_WeaselWords = {
    "candidate",
    "hypothetical",
    "novel",
    "possible",
    "potential",
    "predicted",
    "probable",
    "putative",
    "uncharacterized",
    "unique"
};


namespace
{
    // A "word character" is an ASCII digit, an ASCII letter, '_' or '-'.
    // Plain range comparisons are used, so the result does not depend on locale.
    bool x_IsWordCharacter(char c) {
        const bool is_digit = ('0' <= c) && (c <= '9');
        const bool is_lower = ('a' <= c) && (c <= 'z');
        const bool is_upper = ('A' <= c) && (c <= 'Z');
        const bool is_joiner = (c == '_') || (c == '-');
        return is_digit || is_lower || is_upper || is_joiner;
    }
    // Split `s` into tokens and append them to `v`.
    //
    // A token is a maximal run of characters of the same class: either word
    // characters (see x_IsWordCharacter) or non-word, non-space characters
    // (punctuation and the like). A single space ' ' ends the current token
    // and is never part of one; tabs and other whitespace are assumed to have
    // been dealt with by the caller and are treated as non-word characters.
    //
    // The token start is tracked as size_t (string::npos meaning "no open
    // token") so that positions are never narrowed to int.
    void x_Split(const string& s, vector<string>& v)
    {
        const size_t kNoToken = string::npos;
        size_t start = kNoToken;
        for (size_t i = 0; i < s.length(); ++i) {
            if (s[i] == ' ') {
                if (start != kNoToken) {
                    v.push_back(s.substr(start, i - start));
                    start = kNoToken;
                }
                continue;
            }
            if (start == kNoToken) {
                start = i;
            }
            else if (x_IsWordCharacter(s[i]) != x_IsWordCharacter(s[start])) {
                // character class changed: close the current token, open a new one
                v.push_back(s.substr(start, i - start));
                start = i;
            }
        }
        if (start != kNoToken) {
            v.push_back(s.substr(start));
        }
    }

    // Concatenate the tokens of `v` whose `skip` flag is false.
    // A single space is inserted before a kept token when it is not the first
    // kept token and it begins with a word character; tokens that begin with
    // any other character are appended directly to the preceding text.
    string x_Assemble(vector<string>& v, vector<bool>& skip)
    {
        string joined;
        bool emitted_any = false;
        for (size_t idx = 0; idx < v.size(); ++idx) {
            if (skip[idx]) {
                continue;
            }
            const string& token = v[idx];
            if (emitted_any && x_IsWordCharacter(token[0])) {
                joined.push_back(' ');
            }
            joined.append(token);
            emitted_any = true;
        }
        return joined;
    }


    // Return `str` with whitespace (if strip_space) and/or punctuation
    // (if strip_punct) removed.
    //
    // When nothing has to be removed, `str` itself is returned and `storage`
    // is left untouched. Otherwise the filtered text is built in `storage` and
    // the returned CTempString is created from it, so `storage` must stay
    // alive and unmodified for as long as the result is used.
    //
    // Characters are converted to unsigned char before being classified:
    // passing a negative plain char to isspace()/ispunct() is undefined.
    CTempString x_StripUnimportantCharacters(string& storage, const CTempString& str, bool strip_space, bool strip_punct)
    {
        if (str.empty()) {
            return kEmptyStr;
        }
        if (!strip_space && !strip_punct)
            return str;

        bool has_stripped = false;
        const char* s = str.data();
        for (CTempString::size_type i = 0; i < str.size(); i++, s++)
        {
            const unsigned char uc = static_cast<unsigned char>(*s);
            if ((strip_space && isspace(uc)) || (strip_punct && ispunct(uc)))
            {
                if (!has_stripped) // first occurrence: start copying lazily
                {
                    storage.reserve(str.size()-1); // at least one symbol will be removed
                    storage.clear();
                    storage.append(str.data(), i); // everything seen so far is kept
                    has_stripped = true;
                }
            }
            else if (has_stripped)
            {
                storage.push_back(*s);
            }
        }

        if (has_stripped)
            return storage;
        else
            return str;
    }

    inline
    bool x_DisallowCharacter(const char ch, bool disallow_slash)
    {
        if (isalpha(Uint1(ch)) || isdigit(Uint1(ch)) || ch == '_' || ch == '-') return true;
        else if (disallow_slash && ch == '/') return true;
        else return false;
    };

};


// Include the weasel-word state machine.
// The three macros below supply the declarations (names, types, storage
// class) that "weasel.inc" is expected to complete with data; the resulting
// s_Weasel_emit, s_Weasel_hits and s_Weasel_states objects are passed to
// CMultipatternSearch::Search() in CMatchString::x_PopWeasel().
// NOTE(review): the contents of weasel.inc are not visible here -- presumably
// a generated table matching s_WeaselWords; confirm when changing that list.
#define _FSM_EMIT static bool s_Weasel_emit[]
#define _FSM_HITS static map<size_t, vector<size_t>> s_Weasel_hits
#define _FSM_STATES static size_t s_Weasel_states[]
#include "weasel.inc"
#undef _FSM_EMIT
#undef _FSM_HITS
#undef _FSM_STATES


void CMatchString::x_PopWeasel() const
{
    m_noweasel_start = 0;
    const auto& callback = [&](size_t n, size_t p) {
        if (n < CString_constraint::s_WeaselWords.size()) {
            if (p - m_noweasel_start == CString_constraint::s_WeaselWords[n].length()) {
                m_noweasel_start = p;
                m_weaselmask |= (1 << n);
            }
        }
        else { // space
            if (p == m_noweasel_start) {
                m_noweasel_start = p + 1;
            }
        }
    };
    CMultipatternSearch::Search(this->m_original, s_Weasel_states, s_Weasel_emit, s_Weasel_hits, callback);
}


// Default constructor: no work beyond default initialisation of the
// members and base class.
CString_constraint::CString_constraint() = default;


// Destructor: nothing to release explicitly.
CString_constraint::~CString_constraint() = default;


// A constraint is "empty" when it restricts nothing: none of the
// capitalisation / punctuation flags is set and there is no match text
// (either it cannot be retrieved or it is an empty string).
bool CString_constraint :: Empty() const
{
    const bool has_shape_rule = GetIs_all_caps()
                             || GetIs_all_lower()
                             || GetIs_all_punct()
                             || GetIs_first_cap()
                             || GetIs_first_each_cap();
    if (has_shape_rule) {
        return false;
    }
    return !CanGetMatch_text() || GetMatch_text().empty();
}

bool CString_constraint::x_IsAllCaps(const CMatchString& str) const
{
    return x_GetCompareString(str, e_original) == x_GetCompareString(str, e_uppercase);
}

bool CString_constraint::x_IsAllLowerCase(const CMatchString& str) const
{
    return x_GetCompareString(str, e_original) == x_GetCompareString(str, e_lowercase);
}

bool CString_constraint::x_IsAllPunctuation(const CMatchString& str) const
{
   CTempString match = x_GetCompareString(str, e_original);
   for (unsigned i=0; i< match.size(); i++) {
     if (!ispunct(match[i])) return false;
   }
   return true;
}

// True when `ch` may be skipped during comparison: punctuation while
// Ignore_punct is on, or whitespace while Ignore_space is on.
// The character is converted to unsigned char before classification because
// ispunct()/isspace() are undefined for negative plain-char values.
bool CString_constraint::x_IsSkippable(const char ch) const
{
    const unsigned char uc = static_cast<unsigned char>(ch);
    if (ispunct(uc) && GetIgnore_punct()) {
        return true;
    }
    if (isspace(uc) && GetIgnore_space()) {
        return true;
    }
    return false;
}

bool CString_constraint::x_IsAllSkippable(const CTempString& match) const
{
    for (CTempString::size_type i = 0; i < match.size(); i++) {
        if (!x_IsSkippable(match[i])) {
            return false;
        }
    }
    return true;
}

bool CString_constraint::x_IsFirstCap(const CMatchString& s) const
{
    CTempString str = x_GetCompareString(s, e_original);
    // ignore punctuation and spaces at the beginning of the phrase
    CTempString::const_iterator it = str.begin();
    while (it != str.end() && !isalpha((unsigned char) (*it))) {
    if (isdigit( (unsigned char) (*it))) {
        return false;
    }
        ++it;
    }

    if (it != str.end()) {
        return isalpha((unsigned char) (*it)) && isupper((unsigned char) (*it));
    }
    return false;
}

bool CString_constraint::x_IsFirstEachCap(const CMatchString& s) const
{
    CTempString str = x_GetCompareString(s, e_original);
    bool first(true);
    bool rval(true);
    for (size_t i = 0; i < str.size() && rval; ++i) {
        if (isalpha( (unsigned char) str[i])) {
            if (first) {
                rval = rval && isupper( (unsigned char) str[i] );
                first = false;
            } 
        } else if ( str[i] == '-' ){
            // hyphenated words are considered as one composed word
            if ((i > 0 && !isalpha( (unsigned char) str[i - 1])) || 
                (i + 1 < str.size() && !isalpha( (unsigned char) str[i + 1] ))) 
                first = true;
        } else if (isdigit( (unsigned char) str[i])){
            if (i + 1 < str.size() && isalpha( (unsigned char) str[i + 1])) {
                rval = false;
            }
        } else {
            first = true;
        }
    }
    return rval;
}

bool CString_constraint::x_IsWholeWordMatch(const CTempString& start, size_t found, size_t match_len, bool disallow_slash) const
{
    size_t after_idx;
    if (!match_len) {
        return true;
    }
    else if (start.empty() || found == string::npos) {
        return false;
    }
    else {
        if (found) {
            if (x_DisallowCharacter (start[found-1], disallow_slash)) {
                return false;
            }
        }
        after_idx = found + match_len;
        if (after_idx < start.size() && x_DisallowCharacter(start[after_idx], disallow_slash)) {
            return false;
        }
    }
    return true;
};


// Recursively check whether `pattern` matches a prefix of `str`.
//
// @param str        remaining text to be matched
// @param pattern    remaining pattern text
// @param prev_char  character that precedes str[0] in the full text (0 when
//                   there is none); forwarded to the ignore-word substitutions
// @param match_len  in/out: incremented by the number of characters of `str`
//                   consumed. It is advanced before recursing, so its value is
//                   only meaningful when the function returns true.
// @return true when the whole pattern has been consumed.
//
// Order of the checks at each step:
//  1. ignore-word substitutions (if set): for every possible match length
//     reported by GetMatchLens(), try to match the rest; for "equals" and
//     "ends" locations the rest must consume `str` completely;
//  2. an empty pattern matches;
//  3. an empty `str` matches only an all-skippable pattern, except that a
//     leading pattern space may be dropped (ignored words may follow);
//  4. with Ignore_space / Ignore_punct, a space / punctuation character is
//     skipped in `str` first, otherwise in `pattern`;
//  5. otherwise the first characters must be equal, case-insensitively
//     unless Case_sensitive is set and true.
//
// All character classification calls go through Uint1 so that negative
// plain-char values are never passed to the <cctype> functions.
bool CString_constraint :: x_PartialCompare(const string& str, const string& pattern, char prev_char, size_t & match_len) const
{
    // check for synonyms to skip
    if (IsSetIgnore_words()) {
        ITERATE(CWord_substitution_set::Tdata, word, GetIgnore_words().Get()) {
            vector<size_t> match_lens = (*word)->GetMatchLens(str, pattern, prev_char);
            if (match_lens.size() > 0) {
                size_t word_len = (*word)->GetWord().length();
                ITERATE(vector<size_t>, len, match_lens) {
                    size_t this_match = 0;
                    char this_prev_char = 0;
                    if (*len > 0) {
                        this_prev_char = str.c_str()[(*len) - 1];
                    } else {
                        this_prev_char = prev_char;
                    }
                    bool require_end = false;
                    if (GetMatch_location() == eString_location_equals || GetMatch_location() == eString_location_ends) {
                        require_end = true;
                    }
                    if (x_PartialCompare(str.substr(*len), pattern.substr(word_len), this_prev_char, this_match) &&
                        (!require_end || this_match == str.substr(*len).length())) {
                        match_len += *len;
                        match_len += this_match;
                        return true;
                    }
                }
            }
        }
    }

    if (pattern.length() == 0) {
        return true;
    }

    if (str.length() == 0) {
        if (x_IsAllSkippable(pattern)) {
            return true;
        }
        // special case: can continue if the next character is a space, might have words to ignore
        if (isspace(Uint1(pattern[0]))) {
            return x_PartialCompare(str, pattern.substr(1), ' ', match_len);
        } else {
            return false;
        }
    }
    if (GetIgnore_space()) {
        if (isspace(Uint1(str[0]))) {
            match_len++;
            return x_PartialCompare(str.substr(1), pattern, str[0], match_len);
        } else if (isspace(Uint1(pattern[0]))) {
            return x_PartialCompare(str, pattern.substr(1), prev_char, match_len);
        }
    }
    if (GetIgnore_punct()) {
        if (ispunct(Uint1(str[0]))) {
            match_len++;
            return x_PartialCompare(str.substr(1), pattern, str[0], match_len);
        } else if (ispunct(Uint1(pattern[0]))) {
            return x_PartialCompare(str, pattern.substr(1), prev_char, match_len);
        }
    }
    if (str[0] == pattern[0]) {
        match_len++;
        return x_PartialCompare(str.substr(1), pattern.substr(1), str[0], match_len);
    } else if ((!IsSetCase_sensitive() || !GetCase_sensitive()) &&
               tolower(Uint1(str[0])) == tolower(Uint1(pattern[0]))) {
        match_len++;
        return x_PartialCompare(str.substr(1), pattern.substr(1), str[0], match_len);
    }

    return false;
}


// Thin wrapper around x_PartialCompare(): reports whether `str_match`
// matches at the beginning of `str`.
//
// @param prev_char             character preceding `str` in the full text
//                              (0 when there is none)
// @param ini_target_match_len  optional output; on success receives the
//                              number of characters of `str` covered by the
//                              match. Left untouched on failure.
bool CString_constraint :: x_AdvancedStringCompare(const string& str, const string& str_match, const char prev_char, size_t * ini_target_match_len)  const
{
    size_t consumed = 0;
    if (!x_PartialCompare(str, str_match, prev_char, consumed)) {
        return false;
    }
    if (ini_target_match_len != NULL) {
        *ini_target_match_len = consumed;
    }
    return true;
}

// Match `str` against the pattern `tmp_match` using the "advanced" comparison
// (x_AdvancedStringCompare), honouring the constraint's match location.
//
// The pattern is first tried at offset 0. If that fails and the location is
// neither "starts" nor "equals", every later offset is tried in turn until a
// match is found or the string is exhausted.
//
// NOTE(review): for eString_location_ends a successful match at offset 0 is
// accepted without checking that it reaches the end of `str`, whereas the
// scan loop below rejects such short matches (sub_match_len < len - pos).
// This looks inconsistent -- confirm the intended behaviour.
// NOTE(review): the scan loop always passes 0 as the previous character,
// not str[pos-1] -- confirm this is intended for ignore-word handling.
bool CString_constraint::x_AdvancedStringMatch(const string& str, const string& tmp_match) const
{
    bool rval = false;
    // fall back to an empty pattern when the match text cannot be retrieved
    string match_text = CanGetMatch_text() ? tmp_match : kEmptyStr;

    size_t match_len = 0;

    // attempt at offset 0; "equals" additionally requires the whole string to be covered
    if (x_AdvancedStringCompare (str, match_text, 0, &match_len) && (GetMatch_location() != eString_location_equals || match_len == str.length())) {
        return true;
    }
    else if (GetMatch_location() == eString_location_starts || GetMatch_location() == eString_location_equals) {
        // these locations only allow a match at offset 0
        return false;
    }
    else {
        size_t pos = 1;
        size_t len = str.size();
        while (!rval && pos < len) {
            if (GetWhole_word()) {
                // do not start a match in the middle of a run of letters
                while (pos < len && isalpha (Uint1(str[pos-1]))) pos++;
            }
            if (pos < len) {
                size_t sub_match_len = 0;
                if (x_AdvancedStringCompare (str.substr(pos), match_text, 0, &sub_match_len)) {
                    if (sub_match_len < len - pos && GetMatch_location() == eString_location_ends) {
                        // matched, but not up to the end of the string: keep scanning
                        pos++;
                    }
                    else {
                        rval = true;
                    }
                }
                else {
                    pos++;
                }
            }
        }
    }
    return rval;
};


// Return the constraint's match text in the requested case.
//
// m_match is filled lazily from GetMatch_text() the first time it is found
// empty. An empty match text yields an empty string. With e_automatic the
// case follows the constraint: original text when Case_sensitive is on,
// lowercase otherwise.
//
// The function ends with an unconditional return so that control can never
// run off the end, even if `e_case` holds a value outside the enumerators
// handled by the switch.
CTempString CString_constraint::x_GetConstraintString(ECase e_case) const
{
    if (CanGetMatch_text() && m_match.original().original().empty()) {
        m_match = GetMatch_text();
    }
    if (m_match.original().original().empty()) {
        return "";
    }
    if (e_case == e_automatic) {
        e_case = GetCase_sensitive() ? e_original : e_lowercase;
    }
    switch (e_case) {
        case e_lowercase:
            return m_match.original().lowercase();
        case e_uppercase:
            return m_match.original().uppercase();
        case e_automatic:
        case e_original:
            break;
    }
    return m_match.original().original();
}


// Return the text of `s` that should be compared against the constraint,
// in the requested case.
//
// x_GetConstraintString() is called first (for its side effect only) when
// m_match is still empty, so that the constraint's weasel mask is available.
// With e_automatic the case follows the constraint: original text when
// Case_sensitive is on, lowercase otherwise.
//
// When Ignore_weasel is on and the constraint and `s` share no weasel word,
// the weasel-free variant of `s` is returned; otherwise the full text is.
//
// Each branch ends with an unconditional return so that control can never
// run off the end, even if `e_case` holds a value outside the enumerators
// handled by the switches.
CTempString CString_constraint::x_GetCompareString(const CMatchString& s, ECase e_case) const
{
    if (m_match.original().original().empty()) {
        x_GetConstraintString();
    }

    if (e_case == e_automatic) {
        e_case = GetCase_sensitive() ? e_original : e_lowercase;
    }

    const bool drop_weasels = GetIgnore_weasel() && !(m_match.GetWeaselMask() & s.GetWeaselMask());
    if (drop_weasels) {
        switch (e_case) {
            case e_lowercase:
                return s.GetNoweaselLC();
            case e_uppercase:
                return s.GetNoweaselUC();
            case e_automatic:
            case e_original:
                break;
        }
        return s.GetNoweasel();
    }

    switch (e_case) {
        case e_lowercase:
            return s.original().lowercase();
        case e_uppercase:
            return s.original().uppercase();
        case e_automatic:
        case e_original:
            break;
    }
    return s.original().original();
}


// Plain (non-"advanced") matching of `pattern` against `search` according
// to the constraint's match location:
//   equals   - the two strings must be identical;
//   starts   - the first occurrence must be at offset 0;
//   contains - some occurrence must qualify;
//   ends     - some occurrence must finish exactly at the end of `search`;
//   other    - never matches.
// When Whole_word is on, a qualifying occurrence must additionally pass
// x_IsWholeWordMatch().
//
// Occurrences are enumerated iteratively on the full `search` string, so
// the whole-word check always sees the character that precedes an
// occurrence (searching a substring that starts right at the occurrence
// would hide it), and the stack depth does not grow with the number of
// rejected occurrences.
bool CString_constraint::x_MatchFound(CTempString& search, CTempString& pattern) const
{
    const TMatch_location loc = GetMatch_location();
    if (loc == eString_location_equals) {
        return search == pattern;
    }
    if (loc != eString_location_starts &&
        loc != eString_location_contains &&
        loc != eString_location_ends) {
        return false;
    }

    // evaluated lazily, only for occurrences that pass the location test
    const auto word_ok = [&](SIZE_TYPE at) {
        return !GetWhole_word() || x_IsWholeWordMatch(search, at, pattern.size());
    };

    for (SIZE_TYPE found = search.find(pattern);
         found != NPOS;
         found = search.find(pattern, found + 1)) {
        if (loc == eString_location_starts) {
            // only the first occurrence matters for "starts"
            return found == 0 && word_ok(found);
        }
        if (loc == eString_location_ends && found + pattern.size() != search.size()) {
            continue;
        }
        if (word_ok(found)) {
            return true;
        }
    }
    return false;
}


bool CString_constraint::x_DoesSingleStringMatchConstraint(const CMatchString& str) const
{
    if (str.original().original().empty()) {
        return false;
    }

   // bool rval(false);
    if (Empty()) {
        return true;
    }
    
    if (GetIs_all_caps() && !x_IsAllCaps(str)) {
        return false;
    }
    else if (GetIs_all_lower() && !x_IsAllLowerCase(str)) {
        return false;
    }
    else if (GetIs_all_punct() && !x_IsAllPunctuation(str)) {
        return false;
    }
    else if (GetIs_first_cap() && !x_IsFirstCap(str)) {
        return false;
    }
    else if (GetIs_first_each_cap() && !x_IsFirstEachCap(str)) {
        return false;
    }
    else {
        if (GetMatch_location() == eString_location_inlist) {
            cout << "eString_location_inlist is not supported!\n";
            return false;
        }

        CTempString pattern = x_GetConstraintString();
        CTempString search = x_GetCompareString(str);

        unsigned mask = GetIgnore_weasel() ? m_match.GetWeaselMask() : 0;
        unsigned str_mask = GetIgnore_weasel() ? str.GetWeaselMask() : 0;

        if ((mask & str_mask) != mask) {
            return false; // shortcut
        }

        if (GetMatch_location() != eString_location_inlist && CanGetIgnore_words()){
            if (mask) {
                cout << pattern << " <===> " << search << "\nSelf-weasel case with ignored words is not supported!\n";
                return false;
            }
            return x_AdvancedStringMatch(search, pattern);
        }
        else {
            string s_search, p_search;
            if (GetMatch_location() != eString_location_inlist && (GetIgnore_space() || GetIgnore_punct())) {
                search = x_StripUnimportantCharacters(s_search, search, GetIgnore_space(), GetIgnore_punct());
                pattern = x_StripUnimportantCharacters(p_search, pattern, GetIgnore_space(), GetIgnore_punct());
            }

            if (!mask) { // no self-weasel
                return x_MatchFound(search, pattern);
            }

            // clinical case
            vector<string> v;
            x_Split(search, v);
            vector<bool> skip(v.size(), false);
            vector<size_t> test;
            for (size_t i = 0; i < v.size(); i++) {
                for (size_t k = 0; k < CString_constraint::s_WeaselWords.size(); k++) {
                    unsigned m = (1 << k);
                    if (m & str_mask) {
                        string lower = v[i];
                        NStr::ToLower(lower);
                        if (lower == CString_constraint::s_WeaselWords[k]) {
                            if (m & mask) {
                                test.push_back(i);
                            }
                            skip[i] = true;
                        }
                    }
                }
            }
            // combinatorics
            while (true) {
                string guess = x_Assemble(v, skip);
                CTempString next_guess = guess; // Using CTempString everywhere was a bad idea...
                if (x_MatchFound(next_guess, pattern)) {
                    return true;
                }
                for (size_t i = 0; i < test.size(); i++) {
                    if (skip[test[i]]) {
                        skip[test[i]] = false;
                        break;
                    }
                    else {
                        skip[test[i]] = true;
                        if (i == test.size() - 1) {
                            return false;
                        }
                    }
                }
            }
        }
    }
    return false;
}

// Public entry point: evaluate the constraint against `str` and invert the
// outcome when Not_present is on.
bool CString_constraint::Match(const CMatchString& str) const
{
    const bool satisfied = x_DoesSingleStringMatchConstraint (str);
    if (GetNot_present()) {
        return !satisfied;
    }
    return satisfied;
}

// Replace, in place, every occurrence of the constraint's match text in
// `val` with `replace`, scanning left to right with the "advanced"
// comparison. After a replacement the scan resumes right behind the
// inserted text, so the replacement itself is never re-scanned.
//
// A match that covers no characters of `val` (possible when the match text
// is empty or consists only of skippable characters) is treated as "no
// match": replacing it would never advance past the current position.
//
// @return true when at least one replacement was made.
bool CString_constraint::x_ReplaceContains(string& val, const string& replace) const
{
    bool rval = false;

    size_t offset = 0;
    while (offset < val.length()) {
        size_t match_len = 0;
        const char prev_char = (offset == 0) ? '\0' : val[offset - 1];
        if (x_AdvancedStringCompare(val.substr(offset), GetMatch_text(), prev_char, &match_len)
            && match_len > 0) {
            val = val.substr(0, offset) + replace + val.substr(offset + match_len);
            rval = true;
            offset += replace.length();
        } else {
            offset++;
        }
    }
    return rval;
}


// Build in `result` a copy of `str` in which the portion selected by this
// constraint is replaced with `replace`.
//
//  - empty input: replaced only if the constraint is empty or Not_present is
//    set and true;
//  - empty constraint: the whole text is replaced;
//  - inlist / equals: the whole text is replaced (no comparison is made here);
//  - starts: the matching prefix is replaced;
//  - contains, or no match location set: every occurrence is replaced
//    (`result` is overwritten with the input even if nothing matches);
//  - ends: the first suffix that matches up to the end of the text is
//    replaced.
//
// @return true when `result` holds a replaced string.
bool CString_constraint::ReplaceStringConstraintPortionInString(string& result, const CMatchString& str, const string& replace) const
{
    const string& val = str;

    if (val.empty()) {
        const bool replace_anyway = Empty() || (IsSetNot_present() && GetNot_present());
        if (replace_anyway) {
            result = replace;
        }
        return replace_anyway;
    }

    if (Empty()) {
        result = replace;
        return true;
    }

    if (!IsSetMatch_location()) {
        result = val;
        return x_ReplaceContains(result, replace);
    }

    switch (GetMatch_location()) {
        case eString_location_inlist:
        case eString_location_equals:
            result = replace;
            return true;

        case eString_location_starts:
            {{
                size_t prefix_len = 0;
                if (!x_AdvancedStringCompare(val, GetMatch_text(), 0, &prefix_len)) {
                    return false;
                }
                result = replace;
                result.append(val.data() + prefix_len, val.length() - prefix_len);
                return true;
            }}

        case eString_location_contains:
            result = val;
            return x_ReplaceContains(result, replace);

        case eString_location_ends:
            for (size_t offset = 0; offset < val.length(); ++offset) {
                size_t suffix_len = 0;
                const char prev_char = (offset == 0) ? '\0' : val[offset - 1];
                const bool matched = x_AdvancedStringCompare(val.substr(offset),
                                                             GetMatch_text(),
                                                             prev_char,
                                                             &suffix_len);
                if (matched && offset + suffix_len == val.length()) {
                    result = val.substr(0, offset) + replace;
                    return true;
                }
            }
            return false;
    }
    return false;
}

END_objects_SCOPE // namespace ncbi::objects::
END_NCBI_SCOPE
