3
NqiI                 @   s   d Z ddlZddlZddlZddlmZ dgZejdZejdZ	ejdZ
ejdZejd	Zejd
ZejdZejdZejdZejdZejdejZejdZejdZG dd dejZdS )zA parser for HTML and XHTML.    N)unescape
HTMLParserz[&<]z
&[a-zA-Z#]z%&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]z)&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]z	<[a-zA-Z]z
</[a-zA-Z]>z--\s*>z+([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*z]((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*aF  
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
z#</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>c               @   s   e Zd ZdZd:ZddddZdd	 Zd
d Zdd ZdZ	dd Z
dd Zdd Zdd Zdd Zd;ddZdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 Zd4d5 Zd6d7 Zd8d9 ZdS )<r   aE  Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    scriptstyleT)convert_charrefsc            C   s   || _ | j  dS )zInitialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        N)r   reset)selfr    r
   !/usr/lib/python3.6/html/parser.py__init__X   s    zHTMLParser.__init__c             C   s(   d| _ d| _t| _d| _tjj|  dS )z1Reset this instance.  Loses all unprocessed data. z???N)rawdatalasttaginteresting_normalinteresting
cdata_elem_markupbase
ParserBaser   )r	   r
   r
   r   r   a   s
    zHTMLParser.resetc             C   s   | j | | _ | jd dS )zFeed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        r   N)r   goahead)r	   datar
   r
   r   feedi   s    zHTMLParser.feedc             C   s   | j d dS )zHandle any buffered data.   N)r   )r	   r
   r
   r   closer   s    zHTMLParser.closeNc             C   s   | j S )z)Return full source of start tag: '<...>'.)_HTMLParser__starttag_text)r	   r
   r
   r   get_starttag_textx   s    zHTMLParser.get_starttag_textc             C   s$   |j  | _tjd| j tj| _d S )Nz</\s*%s\s*>)lowerr   recompileIr   )r	   elemr
   r
   r   set_cdata_mode|   s    
zHTMLParser.set_cdata_modec             C   s   t | _d | _d S )N)r   r   r   )r	   r
   r
   r   clear_cdata_mode   s    zHTMLParser.clear_cdata_modec             C   sB  | j }d}t|}x||k r| jr|| j r||jd|}|dk r|jdt||d }|dkrvtjdj	|| rvP |}n(| j
j	||}|r|j }n| jrP |}||k r| jr| j r| jt|||  n| j|||  | j||}||krP |j}|d|r4tj||r&| j|}	n|d|r>| j|}	nr|d|rV| j|}	nZ|d|rn| j|}	nB|d	|r| j|}	n*|d
 |k s|r| jd |d
 }	nP |	dk r&|sP tj||rԐnN|d|r$|d |kr| jd n&tj||r
n| j||d d    n|d|r||}x.dD ]&}
|j|
|d r:|t|
8 }P q:W | j||d |  n|d|r| j||d d   n|||d  j dkr| j||d d   nP|d	|r| j||d d   n,|d|r| j||d d   ntd|}	| j||	}q|d|rtj||}|r|j  dd }| j!| |j" }	|d|	d
 s|	d
 }	| j||	}qn:d||d  kr| j|||d   | j||d }P q|d|rt#j||}|r8|j d
}| j$| |j" }	|d|	d
 s*|	d
 }	| j||	}qt%j||}|r|r|j  ||d  kr|j" }	|	|kr||}	| j||d
 }P n,|d
 |k r| jd | j||d
 }nP qdstdqW |r0||k r0| j r0| jr| j r| jt|||  n| j|||  | j||}||d  | _ d S )Nr   <&"   z[\s;]z</z<!--z<?z<!r      --!---   z	<![CDATA[   	   z	<!doctypezwe should not get here!z&#;zinteresting.search() lied)r'   r(   r)   )&r   lenr   r   findrfindmaxr   r   searchr   starthandle_datar   Z	updatepos
startswithstarttagopenmatchparse_starttagparse_endtagparse_commentparse_piparse_html_declaration
endtagopenhandle_commentendswithunknown_declr   handle_decl	handle_piAssertionErrorcharrefgrouphandle_charrefend	entityrefhandle_entityref
incomplete)r	   rH   r   injZampposr8   r6   ksuffixnamer
   r
   r   r      s    
 








zHTMLParser.goaheadc             C   s   | j }|||d  dks"td|||d  dkr@| j|S |||d  dkr^| j|S |||d  j d	kr|jd
|d }|dkrdS | j||d |  |d S | j|S d S )Nr&   z<!z+unexpected call to parse_html_declaration()r*   z<!--r+   z<![r,   z	<!doctyper   r   r.   r.   )r   rD   r;   Zparse_marked_sectionr   r0   rB   parse_bogus_comment)r	   rL   r   gtposr
   r
   r   r=     s    

z!HTMLParser.parse_html_declarationr   c             C   s`   | j }|||d  dks"td|jd|d }|dkr>d	S |rX| j||d |  |d S )
Nr&   <!</z"unexpected call to parse_comment()r   r   )rT   rU   r.   r.   )r   rD   r0   r?   )r	   rL   Zreportr   posr
   r
   r   rR   (  s    zHTMLParser.parse_bogus_commentc             C   sd   | j }|||d  dks"tdtj||d }|s:dS |j }| j||d |  |j }|S )Nr&   z<?zunexpected call to parse_pi()r   r.   )r   rD   picloser3   r4   rC   rH   )r	   rL   r   r8   rN   r
   r
   r   r<   4  s    zHTMLParser.parse_pic             C   s  d | _ | j|}|dk r|S | j}||| | _ g }tj||d }|sPtd|j }|jdj  | _	}x||k r0t
j||}|sP |jddd\}	}
}|
sd }n^|d d d  ko|dd  kn  p|d d d  ko|dd  kn  r|dd }|rt|}|j|	j |f |j }qnW ||| j }|dkr| j \}}d
| j kr|| j jd
 }t| j | j jd
 }n|t| j  }| j|||  |S |jd	r| j|| n"| j|| || jkr| j| |S )Nr   r   z#unexpected call to parse_starttag()r&   r+   '"r   />
r.   r.   r.   )r   rZ   )r   check_for_whole_start_tagr   tagfind_tolerantr8   rD   rH   rF   r   r   attrfind_tolerantr   appendstripZgetposcountr/   r1   r5   r@   handle_startendtaghandle_starttagCDATA_CONTENT_ELEMENTSr!   )r	   rL   endposr   attrsr8   rO   tagmZattrnamerestZ	attrvaluerH   linenooffsetr
   r
   r   r9   @  sR    
(*

zHTMLParser.parse_starttagc             C   s   | j }tj||}|r|j }|||d  }|dkr>|d S |dkr~|jd|rZ|d S |jd|rjd	S ||krv|S |d S |dkrd
S |dkrdS ||kr|S |d S tdd S )Nr   r   /z/>r&   r   z6abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZzwe should not get here!r.   r.   r.   )r   locatestarttagend_tolerantr8   rH   r6   rD   )r	   rL   r   rh   rN   nextr
   r
   r   r\   s  s.    z$HTMLParser.check_for_whole_start_tagc             C   s2  | j }|||d  dks"tdtj||d }|s:dS |j }tj||}|s| jd k	rr| j|||  |S t	j||d }|s|||d  dkr|d S | j
|S |jdj }|jd|j }| j| |d S |jdj }| jd k	r|| jkr| j|||  |S | j|j  | j  |S )	Nr&   z</zunexpected call to parse_endtagr   r+   z</>r   r.   )r   rD   	endendtagr3   rH   
endtagfindr8   r   r5   r]   rR   rF   r   r0   handle_endtagr"   )r	   rL   r   r8   rS   Z	namematchZtagnamer    r
   r
   r   r:     s8    


zHTMLParser.parse_endtagc             C   s   | j || | j| d S )N)rc   rq   )r	   rg   rf   r
   r
   r   rb     s    zHTMLParser.handle_startendtagc             C   s   d S )Nr
   )r	   rg   rf   r
   r
   r   rc     s    zHTMLParser.handle_starttagc             C   s   d S )Nr
   )r	   rg   r
   r
   r   rq     s    zHTMLParser.handle_endtagc             C   s   d S )Nr
   )r	   rQ   r
   r
   r   rG     s    zHTMLParser.handle_charrefc             C   s   d S )Nr
   )r	   rQ   r
   r
   r   rJ     s    zHTMLParser.handle_entityrefc             C   s   d S )Nr
   )r	   r   r
   r
   r   r5     s    zHTMLParser.handle_datac             C   s   d S )Nr
   )r	   r   r
   r
   r   r?     s    zHTMLParser.handle_commentc             C   s   d S )Nr
   )r	   Zdeclr
   r
   r   rB     s    zHTMLParser.handle_declc             C   s   d S )Nr
   )r	   r   r
   r
   r   rC     s    zHTMLParser.handle_pic             C   s   d S )Nr
   )r	   r   r
   r
   r   rA     s    zHTMLParser.unknown_declc             C   s   t jdtdd t|S )NzZThe unescape method is deprecated and will be removed in 3.5, use html.unescape() instead.r&   )
stacklevel)warningswarnDeprecationWarningr   )r	   sr
   r
   r   r     s    
zHTMLParser.unescape)r   r   )r   )__name__
__module____qualname____doc__rd   r   r   r   r   r   r   r!   r"   r   r=   rR   r<   r9   r\   r:   rb   rc   rq   rG   rJ   r5   r?   rB   rC   rA   r   r
   r
   r
   r   r   @   s:   		 
3"()rz   r   rs   r   Zhtmlr   __all__r   r   rK   rI   rE   r7   r>   rW   Zcommentcloser]   r^   VERBOSErm   ro   rp   r   r   r
   r
   r
   r   <module>   s*   












