""" Iterator based sre token scanner """ import sre_parse, sre_compile, sre_constants from sre_constants import BRANCH, SUBPATTERN from re import VERBOSE, MULTILINE, DOTALL import re __all__ = ['Scanner', 'pattern'] FLAGS = (VERBOSE | MULTILINE | DOTALL) class Scanner(object): def __init__(self, lexicon, flags=FLAGS): self.actions = [None] # combine phrases into a compound pattern s = sre_parse.Pattern() s.flags = flags p = [] for idx, token in enumerate(lexicon): phrase = token.pattern try: subpattern = sre_parse.SubPattern(s, [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))]) except sre_constants.error: raise p.append(subpattern) self.actions.append(token) p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) self.scanner = sre_compile.compile(p) def iterscan(self, string, idx=0, context=None): """ Yield match, end_idx for each match """ match = self.scanner.scanner(string, idx).match actions = self.actions lastend = idx end = len(string) while True: m = match() if m is None: break matchbegin, matchend = m.span() if lastend == matchend: break action = actions[m.lastindex] if action is not None: rval, next_pos = action(m, context) if next_pos is not None and next_pos != matchend: # "fast forward" the scanner matchend = next_pos match = self.scanner.scanner(string, matchend).match yield rval, matchend lastend = matchend def pattern(pattern, flags=FLAGS): def decorator(fn): fn.pattern = pattern fn.regex = re.compile(pattern, flags) return fn return decorator