#!/usr/bin/env python # -*- coding: utf-8 -*- ''' Context-independent xHTML pair matcher Use method match(html, start_ix) to find matching pair. If pair was found, this function returns a list of indexes where tag pair starts and ends. If pair wasn't found, None will be returned. The last matched (or unmatched) result is saved in last_match dictionary for later use. @author: Sergey Chikuyonok (serge.che@gmail.com) ''' import re start_tag = r'<([\w\:\-]+)((?:\s+[\w\-:]+(?:\s*=\s*(?:(?:"[^"]*")|(?:\'[^\']*\')|[^>\s]+))?)*)\s*(\/?)>' end_tag = r'<\/([\w\:\-]+)[^>]*>' attr = r'([\w\-:]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:\'((?:\\.|[^\'])*)\')|([^>\s]+)))?' "Last matched HTML pair" last_match = { 'opening_tag': None, # Tag() or Comment() object 'closing_tag': None, # Tag() or Comment() object 'start_ix': -1, 'end_ix': -1 } cur_mode = 'xhtml' "Current matching mode" def set_mode(new_mode): global cur_mode if new_mode != 'html': new_mode = 'xhtml' cur_mode = new_mode def make_map(elems): """ Create dictionary of elements for faster searching @param elems: Elements, separated by comma @type elems: str """ obj = {} for elem in elems.split(','): obj[elem] = True return obj # Empty Elements - HTML 4.01 empty = make_map("area,base,basefont,br,col,frame,hr,img,input,isindex,link,meta,param,embed"); # Block Elements - HTML 4.01 block = make_map("address,applet,blockquote,button,center,dd,dir,div,dl,dt,fieldset,form,frameset,hr,iframe,isindex,li,map,menu,noframes,noscript,object,ol,p,pre,script,table,tbody,td,tfoot,th,thead,tr,ul"); # Inline Elements - HTML 4.01 inline = make_map("a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,select,small,span,strike,strong,sub,sup,textarea,tt,u,var"); # Elements that you can, intentionally, leave open # (and which close themselves) close_self = make_map("colgroup,dd,dt,li,options,p,td,tfoot,th,thead,tr"); # Attributes that have their values filled in disabled="disabled" fill_attrs = make_map("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected"); #Special Elements (can contain anything) # serge.che: parsing data inside elements is a "feature" special = make_map("style"); class Tag(): """Matched tag""" def __init__(self, match, ix): """ @type match: MatchObject @param match: Matched HTML tag @type ix: int @param ix: Tag's position """ global cur_mode name = match.group(1).lower() self.name = name self.full_tag = match.group(0) self.start = ix self.end = ix + len(self.full_tag) self.unary = ( len(match.groups()) > 2 and bool(match.group(3)) ) or (name in empty and cur_mode == 'html') self.type = 'tag' self.close_self = (name in close_self and cur_mode == 'html') class Comment(): "Matched comment" def __init__(self, start, end): self.start = start self.end = end self.type = 'comment' def make_range(opening_tag=None, closing_tag=None, ix=0): """ Makes selection ranges for matched tag pair @type opening_tag: Tag @type closing_tag: Tag @type ix: int @return list """ start_ix, end_ix = -1, -1 if opening_tag and not closing_tag: # unary element start_ix = opening_tag.start end_ix = opening_tag.end elif opening_tag and closing_tag: # complete element if (opening_tag.start < ix and opening_tag.end > ix) or (closing_tag.start <= ix and closing_tag.end > ix): start_ix = opening_tag.start end_ix = closing_tag.end; else: start_ix = opening_tag.end end_ix = closing_tag.start return start_ix, end_ix def save_match(opening_tag=None, closing_tag=None, ix=0): """ Save matched tag for later use and return found indexes @type opening_tag: Tag @type closing_tag: Tag @type ix: int @return list """ last_match['opening_tag'] = opening_tag; last_match['closing_tag'] = closing_tag; last_match['start_ix'], last_match['end_ix'] = make_range(opening_tag, closing_tag, ix) return last_match['start_ix'] != -1 and (last_match['start_ix'], last_match['end_ix']) or (None, None) def match(html, start_ix, mode='xhtml'): """ Search for matching tags in html, starting from start_ix position. The result is automatically saved in last_match property """ return _find_pair(html, start_ix, mode, save_match) def find(html, start_ix, mode='xhtml'): """ Search for matching tags in html, starting from start_ix position. """ return _find_pair(html, start_ix, mode) def get_tags(html, start_ix, mode='xhtml'): """ Search for matching tags in html, starting from start_ix position. The difference between match function itself is that get_tags method doesn't save matched result in last_match property and returns array of opening and closing tags This method is generally used for lookups """ return _find_pair(html, start_ix, mode, lambda op, cl=None, ix=0: (op, cl) if op and op.type == 'tag' else None) def _find_pair(html, start_ix, mode='xhtml', action=make_range): """ Search for matching tags in html, starting from start_ix position @param html: Code to search @type html: str @param start_ix: Character index where to start searching pair (commonly, current caret position) @type start_ix: int @param action: Function that creates selection range @type action: function @return: list """ forward_stack = [] backward_stack = [] opening_tag = None closing_tag = None html_len = len(html) set_mode(mode) def has_match(substr, start=None): if start is None: start = ix return html.find(substr, start) == start def find_comment_start(start_pos): while start_pos: if html[start_pos] == '<' and has_match('') + ix + 3; if ix < start_ix and end_ix >= start_ix: return action(Comment(ix, end_ix)) elif ch == '-' and has_match('-->'): # found comment end # search left until comment start is reached ix = find_comment_start(ix) ix -= 1 if not opening_tag: return action(None) # find closing tag if not closing_tag: ix = start_ix while ix < html_len: ch = html[ix] if ch == '<': check_str = html[ix:] m = re.match(start_tag, check_str) if m: # found opening tag tmp_tag = Tag(m, ix); if not tmp_tag.unary: forward_stack.append(tmp_tag) else: m = re.match(end_tag, check_str) if m: #found closing tag tmp_tag = Tag(m, ix); if forward_stack and forward_stack[-1].name == tmp_tag.name: forward_stack.pop() else: # found matched closing tag closing_tag = tmp_tag; break elif has_match('') + 3 continue elif ch == '-' and has_match('-->'): # looks like cursor was inside comment with invalid HTML if not forward_stack or forward_stack[-1].type != 'comment': end_ix = ix + 3 return action(Comment( find_comment_start(ix), end_ix )) ix += 1 return action(opening_tag, closing_tag, start_ix)