pax_global_header00006660000000000000000000000064125723753510014524gustar00rootroot0000000000000052 comment=0668112a15b9e8e9355a1261040c36b4a6034020 reparser-1.4.3/000077500000000000000000000000001257237535100133545ustar00rootroot00000000000000reparser-1.4.3/.gitignore000066400000000000000000000000561257237535100153450ustar00rootroot00000000000000__pycache__/ *.py[cod] build/ dist/ MANIFEST reparser-1.4.3/LICENSE000066400000000000000000000020701257237535100143600ustar00rootroot00000000000000The MIT License (MIT) Copyright (c) 2015 Michal Krenek Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. reparser-1.4.3/MANIFEST.in000066400000000000000000000000431257237535100151070ustar00rootroot00000000000000include LICENSE include README.rst reparser-1.4.3/PKGBUILD000066400000000000000000000010621257237535100144770ustar00rootroot00000000000000# Maintainer: Michal Krenek (Mikos) pkgname=python-reparser _pkgname=reparser pkgver=1.4.3 pkgrel=1 pkgdesc="Simple regex-based lexer/parser for inline markup" arch=('any') url="https://github.com/xmikos/reparser" license=('MIT') depends=('python') makedepends=('python-setuptools') source=(https://github.com/xmikos/reparser/archive/v$pkgver.tar.gz) build() { cd "$srcdir/${_pkgname}-$pkgver" python setup.py build } package() { cd "$srcdir/${_pkgname}-$pkgver" python setup.py install --root="$pkgdir" } # vim:set ts=2 sw=2 et: reparser-1.4.3/README.rst000066400000000000000000000050341257237535100150450ustar00rootroot00000000000000ReParser ======== Simple regex-based lexer/parser for inline markup Requirements ------------ - Python 3 Usage ----- Example:: import re from pprint import pprint from reparser import Parser, Token, MatchGroup boundary_chars = r'\s`!()\[\]{{}};:\'".,<>?«»“”‘’*_~=' b_left = r'(?:(?<=[' + boundary_chars + r'])|(?<=^))' # Lookbehind b_right = r'(?:(?=[' + boundary_chars + r'])|(?=$))' # Lookahead markdown_start = b_left + r'(?.+?)\]\((?P.+?)\)' newline = r'\n|\r\n' url_proto_regex = re.compile(r'(?i)^[a-z][\w-]+:/{1,3}') def markdown(tag): """Return sequence of start and end regex patterns for simple Markdown tag""" return (markdown_start.format(tag=tag), markdown_end.format(tag=tag)) def url_complete(url): """If URL doesn't start with protocol, prepend it with http://""" return url if url_proto_regex.search(url) else 'http://' + url tokens = [ Token('bi1', *markdown(r'\*\*\*'), is_bold=True, is_italic=True), Token('bi2', *markdown(r'___'), is_bold=True, is_italic=True), Token('b1', *markdown(r'\*\*'), is_bold=True), Token('b2', *markdown(r'__'), is_bold=True), Token('i1', *markdown(r'\*'), is_italic=True), Token('i2', *markdown(r'_'), is_italic=True), Token('pre3', *markdown(r'```'), skip=True), Token('pre2', *markdown(r'``'), skip=True), Token('pre1', *markdown(r'`'), skip=True), Token('s', *markdown(r'~~'), is_strikethrough=True), Token('u', *markdown(r'=='), is_underline=True), Token('link', markdown_link, text=MatchGroup('link'), link_target=MatchGroup('url', func=url_complete)), Token('br', newline, text='\n', segment_type="LINE_BREAK") ] parser = Parser(tokens) text = ('Hello **bold** world!\n' 'You can **try *this* awesome** [link](www.eff.org).') segments = parser.parse(text) pprint([(segment.text, segment.params) for segment in segments]) Output:: [('Hello ', {}), ('bold', {'is_bold': True}), (' world!', {}), ('\n', {'segment_type': 'LINE_BREAK'}), ('You can ', {}), ('try ', {'is_bold': True}), ('this', {'is_bold': True, 'is_italic': True}), (' awesome', {'is_bold': True}), (' ', {}), ('link', {'link_target': 'http://www.eff.org'}), ('.', {})] reparser-1.4.3/reparser.py000066400000000000000000000144631257237535100155610ustar00rootroot00000000000000import re, enum # Precompiled regex for matching named groups in regex patterns group_regex = re.compile(r'\?P<(.+?)>') class Segment: """Segment of parsed text""" def __init__(self, text, token=None, match=None, **params): self.text = text self.params = params if token and match: self.update_text(token, match) self.update_params(token, match) def update_text(self, token, match): """Update text from results of regex match""" if isinstance(self.text, MatchGroup): self.text = self.text.get_group_value(token, match) def update_params(self, token, match): """Update dict of params from results of regex match""" for k, v in self.params.items(): if isinstance(v, MatchGroup): self.params[k] = v.get_group_value(token, match) class Token: """Definition of token which should be parsed from text""" def __init__(self, name, pattern_start, pattern_end=None, text=None, skip=False, **params): self.name = name self.group_start = '{}_start'.format(self.name) self.group_end = '{}_end'.format(self.name) if pattern_end else None self.pattern_start = self.modify_pattern(pattern_start, self.group_start) self.pattern_end = self.modify_pattern(pattern_end, self.group_end) if pattern_end else None self.text = text self.skip = skip self.params = params def modify_pattern(self, pattern, group): """Rename groups in regex pattern and enclose it in named group""" pattern = group_regex.sub(r'?P<{}_\1>'.format(self.name), pattern) return r'(?P<{}>{})'.format(group, pattern) class MatchGroup: """Name of regex group which should be replaced by its value when token is parsed""" def __init__(self, group, func=None): self.group = group self.func = func def get_group_value(self, token, match): """Return value of regex match for the specified group""" try: value = match.group('{}_{}'.format(token.name, self.group)) except IndexError: value = '' return self.func(value) if callable(self.func) else value class MatchType(enum.Enum): """Type of token matched by regex""" start = 1 end = 2 single = 3 class Parser: """Simple regex-based lexer/parser for inline markup""" def __init__(self, tokens): self.tokens = tokens self.regex = self.build_regex(tokens) self.groups = self.build_groups(tokens) def preprocess(self, text): """Preprocess text before parsing (should be reimplemented by subclass)""" return text def postprocess(self, text): """Postprocess text after parsing (should be reimplemented by subclass)""" return text def build_regex(self, tokens): """Build compound regex from list of tokens""" patterns = [] for token in tokens: patterns.append(token.pattern_start) if token.pattern_end: patterns.append(token.pattern_end) return re.compile('|'.join(patterns), re.DOTALL) def build_groups(self, tokens): """Build dict of groups from list of tokens""" groups = {} for token in tokens: match_type = MatchType.start if token.group_end else MatchType.single groups[token.group_start] = (token, match_type) if token.group_end: groups[token.group_end] = (token, MatchType.end) return groups def get_matched_token(self, match): """Find which token has been matched by compound regex""" match_groupdict = match.groupdict() for group in self.groups: if match_groupdict[group] is not None: token, match_type = self.groups[group] return (token, match_type, group) def get_params(self, token_stack): """Get params from stack of tokens""" params = {} for token in token_stack: params.update(token.params) return params def remove_token(self, token_stack, token): """Remove last occurance of token from stack""" token_stack.reverse() try: token_stack.remove(token) retval = True except ValueError: retval = False token_stack.reverse() return retval def parse(self, text): """Parse text to obtain list of Segments""" text = self.preprocess(text) token_stack = [] last_pos = 0 # Iterate through all matched tokens for match in self.regex.finditer(text): # Find which token has been matched by regex token, match_type, group = self.get_matched_token(match) # Get params from stack of tokens params = self.get_params(token_stack) # Should we skip interpreting tokens? skip = token_stack[-1].skip if token_stack else False # Check for end token first if match_type == MatchType.end: if not skip or token_stack[-1] == token: removed = self.remove_token(token_stack, token) if removed: skip = False else: skip = True if not skip: # Append text preceding matched token start_pos = match.start(group) if start_pos > last_pos: yield Segment(self.postprocess(text[last_pos:start_pos]), **params) # Actions specific for start token or single token if match_type == MatchType.start: token_stack.append(token) elif match_type == MatchType.single: single_params = params.copy() single_params.update(token.params) single_text = token.text if token.text is not None else match.group(group) yield Segment(single_text, token=token, match=match, **single_params) # Move last position pointer to the end of matched token last_pos = match.end(group) # Append anything that's left if last_pos < len(text): params = self.get_params(token_stack) yield Segment(self.postprocess(text[last_pos:]), **params) reparser-1.4.3/setup.py000077500000000000000000000015741257237535100151000ustar00rootroot00000000000000#!/usr/bin/env python import sys from setuptools import setup install_requires = [] if sys.version_info < (3, 4): install_requires.append('enum34') setup( name="ReParser", version="1.4.3", description="Simple regex-based lexer/parser for inline markup", author="Michal Krenek (Mikos)", author_email="m.krenek@gmail.com", url="https://github.com/xmikos/reparser", license="MIT", py_modules=["reparser"], install_requires=install_requires, classifiers=[ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python :: 3", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing :: Markup" ] )