# -*- coding: utf-8 -*-
"""
chemdataextractor.nlp.cem
~~~~~~~~~~~~~~~~~~~~~~~~~
Named entity recognition (NER) for Chemical entity mentions (CEM).
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re
import six
from ..text import bracket_level
from .lexicon import ChemLexicon
from .tag import BaseTagger, CrfTagger, DictionaryTagger
log = logging.getLogger(__name__)
#: Trailing token fragments that are disregarded when checking stopwords and
#: when working out entity spans.
IGNORE_SUFFIX = [
    # Improved tokenization has made many of these entries redundant; they are
    # kept because leaving them in does little harm.
    #
    # NOTE(review): in the related pairs listed here, the more specific ending
    # comes before the shorter one it contains (e.g. '-o-deethylase' before
    # '-deethylase', '-6-phosphatase' before '-phosphatase'). Presumably
    # consumers rely on first-match order -- keep it when editing; verify
    # against the code that iterates this list.
    '-', "'s", '-activated', '-adequate', '-affected', '-anesthetized', '-based', '-binding', '-boosted', '-cane',
    '-conditioned', '-containing', '-covered', '-deficient', '-dependent', '-derived', '-electrolyte', '-enriched',
    '-exposed', '-flanking', '-free', '-fused', '-gated', '-glucuronosyltransferases', '-increasing', '-induced',
    '-inducible', '-l-tyrosine', '-labeled', '-lesioned', '-loaded', '-mediated', '-patterned', '-primed', '-reducing',
    '-regulated', '-releasing', '-resistant', '-response', '-rich', '-s-transferase', '-sensitive', '-soluble',
    '-stimulated', '-stressed', '-supplemented', '-terminal', '-transferase', '-treated', '-type', '-blood',
    '-specific', '-like', '-elicited', '-stripped', '-transfer', '-conjugate', '-coated', '-producing', '-oxidized',
    '-associated', '-related', '-converting', '-ligand', '-on-glass', '-seeking', '-hydrolyzing', '-o-deethylase',
    '-deethylase', '-o-depentylase', '-depentylase', '-n-demethylase', '-demethylase', '-o-methyltransferase',
    '-c-oxidase', '-oxidase', '-n-biosidase', '-biosidase', '-immunoproteins', '-spiked', '-lowering', '-page',
    '-depletion', '-formation', '-dealkylation', '-deethylation', '-alkylation', '-ribosylation', '-production',
    '-demethylation', '-oxidation', '-transition', '-glycosylation', '-zwitterion', '-benzylation', '-reduction',
    '-oxygenation', '-nitrosylation', '-evoked', '-mutated', '-doped', '-aged', '-increased', '-triggered', '-linked',
    '-fixed', '-injected', '-contaminated', '-depleted', '-enhanced', '-stained', '-modified', '-fed', '-demethylated',
    '-catalyzed', '-etched', '-labelled', '-conjugated', '-pretreated', '-ribosylated', '-phosphorylated', '-reduced',
    '-bonded', '-stabilised', '-crosslinked', '-mannosylated', '-capped', '-supported', '-initiated', '-integrated',
    '-accelerated', '-encapsulated', '-untreated', '-expanded', '-coupled', '-terminated', '-assisted',
    '-permeabilized', '-resulted', '-alkylated', '-functionalized', '-contained', '-buffered', '-caused', '-cyclized',
    '-substituted', '-modulated', '-inhibited', '-centered', '-promoted', '-confirmed', '-provoked', '-dominated',
    '-limited', '-challenged', '-tetrabrominated', '-unesterified', '-refreshed', '-bottled', '-protonated',
    '-incubated', '-tagged', '-damaged', '-bridged', '-maintained', '-impregnated', '-metabolizing', '-deprived',
    '-insensitive', '-dendrimer', '-receptor', '-tolerant', '-influx', '-administrated', '-requiring', '-permeable',
    '-transport', '-intoxicated', '-overload', '-derivatives', '-derivative', '-sweetened', '-transporter', '-bound',
    '-extract', '-bonding', '-bond', '-trna', '-redistribution', '-copolymers', '-copolymer', '-appended',
    '-susceptible', '-transfected', '-bearing', '-regenerating', '-induction', '-conducting', '-decorated',
    '-encapsulating', '-consuming', '-bridge', '-dependence', '-Pdots', '-only', '-carrying', '-treating', '-isomerase',
    '-ion', '-ions', '-coordinated', '-saturated', '-sparing', '-enclosed', '-stabilized', '-polymer', '-yeast',
    '-making', '-porous', '-independent', '-metallized', '-attenuated', '-liquid', '-caged', '-deficiency', '-sensing',
    '-recognition', '-responsiveness', '-embedded', '-connectivity', '-abuse', '-chelating', '-decocted', '-forming',
    '-nutrition', '-scavenging', '-preferring', '-mimicking', '-drugs', '-drug', '-lubricants', '-adsorption',
    '-ligated', '-detected', '-responsive', '-reacting', '-defined', '-capturing', '-group', '-abstinent', '-paired',
    '-devalued', '-need', '-cellulose', '-atpase', '-inactivated', '-β-glucosaminidase', '-glucosaminidase', '-dosed',
    '-imprinted', '-precipitated', '-monoadducts', '-vacancies', '-vacancy', '-attributed', '-depolarization',
    '-depolarized', '-liver', '-testes', '-reversible', '-active', '-reactive', '-dextran', '-fixing', '-synthesizing',
    '-inhibitory', '-cleaving', '-positive', '-activity', '-fluorescence', '-regulating', '-NPs', '-scanning',
    '-water', '-nmr', '-limiting', '-refractory', '-knot', '-variable', '-biomolecule', '-backbone', '-exchange',
    '-donating', '-coating', '-hydrogenase', '-hydrogenases', '-intolerant', '-deplete', '-poor', '-loading',
    '-enrichment', '-elevating', '-resitant', '-stabilizing', '-pathway', '-fortified', '-adjusted',
    '-restricted', '-dependant', '-locked', '-normalized', '-aromatic', '-hydroxylation', '-intermediate',
    '-6-phosphatase', '-phosphatase', '-linker', '-proteomic', '-mimetic', '-lipid', '-radical', '-receptors',
    '-substrate', '-conjugates', '-promoting', '-dye', '-functionalyzed', '-catalysed', '-reductase', '-QDs',
    '-complexes', '-placebo', '-transferases', '-alginate', '-competing', '-depleting', '-sensitized',
    '-protein', '-regulatory', '-target', '-toxin', '-yield', '-planted', '-produced', '-derivatized', '-secreting',
    '-modifying', '-DNA', '-bonds', '-assemblages', '-exposure', '-negative', '-sealed', '-atom', '-atoms',
    '-abstraction', '-concentration', '-doping', '-competitive', '-acclimation', '-acclimated', '-interlinked',
    '-suppressed', '-postlabeling', '-labeling', '-diabetic', '-omitted', '-sufficient', '-generating', '-terminus',
    '-adducts', '-compound', '-compounds', '-γ-lyase', '-γ-synthase', '-lyase', '-synthase', '-inhibitor',
    '-protected', '-multiwall', '-stripping', '-plasma', '-evolving',
]
#: Leading token fragments that are disregarded when checking stopwords and
#: when working out entity spans.
IGNORE_PREFIX = [
    'fluorophore-', 'low-', 'high-', 'single-', 'odd-', 'non-',
    # NOTE(review): 'high-' is listed a second time on the next line, and
    # '-multiwall' has the leading-hyphen shape of a suffix entry rather than a
    # prefix. Both are preserved verbatim here -- confirm whether they are
    # intentional before changing them.
    'high-', 'cross-', 'cellulose-', 'anti-', '-multiwall',
    'globular-', 'plasma-', 'hybrid-', 'protein-', 'explicit-', 'cation-',
    'water-', 'through-', 'starch-', 'rigid-',
    'conjugated-', 'photoactivatable-', 'alginate-', 'nano-', 'dye-',
    'ligand-', 'enzyme-', 'platelet-', 'photo-',
    'total-', 'drug-', 'nanoparticle-', 'nanomaterial-', 'inter-', 'ion-',
    'post-', 'one-',
]
#: Tokens that are dropped from an entity match when they are its last token.
STRIP_END = [
    'groups', 'group', 'colloidal', 'dyes', 'dye', 'products', 'product',
    'substances', 'substance', 'solution', 'derivatives', 'derivative', 'analog',
    'salts', 'salt', 'minerals', 'mineral', 'anesthetic', 'tablet', 'tablets',
    'preparation', 'atoms', 'atom', 'monomers', 'monomer',
    'nanoparticles', 'nanoparticle', 'radicals', 'radical',
    'dendrimers', 'dendrimer', 'ions', 'ion', 'particles', 'particle',
    'anion', 'cation', 'foam', 'cellulose', 'dextran',
    # An opening bracket can only be a leftover at the end of a match.
    '(',
    'dust', 'herbicide', 'disease', 'diseases',
    # Conjunctions and punctuation.
    'and', 'or', ';', ',', '.',
]
#: Tokens that are dropped from an entity match when they are its first token.
STRIP_START = [
    'anhydrous', 'elemental', 'amorphous', 'conjugated', 'colloidal', 'activated',
    'water-soluble', 'total', 'superparamagnetic', 'molecular', 'high-density',
    'synthetic', 'low-density', 'long-chain', 'fused', 'radioactive',
    'reduced', 'anatase', 'dextran',
    # A closing bracket can only be a leftover at the start of a match.
    ')',
    'trisubstituted', 'deposited', 'herbicide', 'antagonist', 'agonist',
    # Conjunctions.
    'and', 'or',
    'metallic', 'embryotoxic', 'monoclinic',
]
#: Disallowed tokens in chemical entity mentions (discard if any single token has exact case-insensitive match)
STOP_TOKENS = {
'gene', 'inhibitor', 'genetical', 'human', 'recombinant', 'recombination', 'adenovirus', 'bovine', 'chicken',
'sheep', 'pig', 'horse',