3 * Original code by Erik John Resig (ejohn.org)
\r
4 * http://ejohn.org/blog/pure-javascript-html-parser/
\r
8 var X_HTMLParser_CHARS = {
\r
9 'A':1,'B':1,'C':1,'D':1,'E':1,'F':1,'G':1,'H':1,'I':1,'J':1,'K':1,'L':1,'M':1,'N':1,'O':1,'P':1,'Q':1,'R':1,'S':1,'T':1,'U':1,'V':1,'W':1,'X':1,'Y':1,'Z':1,
\r
10 'a':2,'b':2,'c':2,'d':2,'e':2,'f':2,'g':2,'h':2,'i':2,'j':2,'k':2,'l':2,'m':2,'n':2,'o':2,'p':2,'q':2,'r':2,'s':2,'t':2,'u':2,'v':2,'w':2,'x':2,'y':2,'z':2,
\r
11 '!':1,// "0" ': 4, "1" : 4, "2" : 4, "3" : 4, "4" : 4, "5" : 4, "6" : 4, "7" : 4, "8" : 4, "9" : 4, closure compiler で minify すると ie4 で error、eval使う
\r
13 '\t' : 16, '\r\n' : 16, '\r' : 16, '\n' : 16, '\f' : 16, '\b' : 16, ' ' : 16
\r
16 // Empty Elements - HTML 4.01
\r
19 // Block Elements - HTML 4.01
\r
20 X_HTMLParser_block = {'ADDRESS':1,'APPLET':1,'BLOCKQUOTE':1,'BUTTON':1,'CENTER':1,'DD':1,'DEL':1,'DIR':1,'DIV':1,'DL':1,'DT':1,'FIELDSET':1,'FORM':1,'FRAMESET':1,'HR':1,'IFRAME':1,'INS':1,
\r
21 'ISINDEX':1,'LI':1,'MAP':1,'MENU':1,'NOFRAMES':1,'NOSCRIPT':1,'OBJECT':1,'OL':1,'P':1,'PRE':1,'SCRIPT':1,'TABLE':1,'TBODY':1,'TD':1,'TFOOT':1,'TH':1,'THEAD':1,'TR':1,'UL':1 },
\r
22 // Inline Elements - HTML 4.01
\r
23 X_HTMLParser_inline = {/*'A':1,*/'ABBR':1,'ACRONYM':1,'APPLET':1,'B':1,'BASEFONT':1,'BDO':1,'BIG':1,'BR':1,'BUTTON':1,'CITE':1,'CODE':1,'DEL':1,'DFN':1,'EM':1,'FONT':1,'I':1,'IFRAME':1,'IMG':1,
\r
24 'INPUT':1,'INS':1,'KBD':1,'LABEL':1,'MAP':1,'OBJECT':1,'Q':1,'S':1,'SAMP':1,'SCRIPT':1,'SELECT':1,'SMALL':1,'SPAN':1,'STRIKE':1,'STRONG':1,'SUB':1,'SUP':1,'TEXTAREA':1,'TT':1,'U':1,'VAR':1},
\r
25 // Elements that you can,' intentionally,' leave open
\r
26 // (and which close themselves)
\r
27 X_HTMLParser_closeSelf = {'OLGROUP':1,'DD':1,'DT':1,'LI':1,'OPTIONS':1,'P':1,'TBODY':1,'TD':1,'TFOOT':1,'TH':1,'THEAD':1,'TR':1}, // add tbody
\r
29 X_HTMLParser_sisters = {
\r
30 'TH' : { 'TD' : 1 },
\r
31 'TD' : { 'TH' : 1 },
\r
32 'DT' : { 'DD' : 1 },
\r
33 'DD' : { 'DT' : 1 },
\r
34 'COLGROUP' : { 'CAPTION' : 1 },
\r
35 'THEAD' : { 'CAPTION' : 1, 'COLGROUP' : 1 },
\r
36 'TFOOT' : { 'CAPTION' : 1, 'COLGROUP' : 1, 'THEAD' : 1, 'TBODY' : 1 },
\r
37 'TBODY' : { 'CAPTION' : 1, 'COLGROUP' : 1, 'THEAD' : 1, 'TFOOT' : 1 }
\r
40 * http://www.tohoho-web.com/html/tbody.htm
\r
41 * HTML4.01では、ヘッダとフッタを先読みして表示するために、<tbody> よりも <tfoot> の方を先に記述しなくてはならないと定義されています。
\r
42 * IE5.0 などでは HEAD → BODY → FOOT の順に表示するのですが、
\r
43 * <tfoot> に未対応の古いブラウザでは、HEAD → FOOT → BODY の順に表示されてしまいます。
\r
44 * また、HTML5 では、<tfoot> と <tbody> の順番はどちらでもよいことになりました。
\r
47 // Attributes that have their values filled in disabled="disabled"
\r
49 // Special Elements (can contain anything)
\r
50 X_HTMLParser_special = { 'SCRIPT' : 1, 'STYLE' : 1, 'PLAINTEXT' : 1, 'XMP' : 1, 'TEXTAREA' : 1 },
\r
52 X_HTMLParser_skipFixNesting = false;
\r
54 function X_HTMLParser_exec( html, handler, async ){
\r
55 var special = X_HTMLParser_special,
\r
56 //plainText = X_HTMLParser_plainText,
\r
57 startTime = async && X_Timer_now(),
\r
58 stack = async ? async[ 1 ] : [],
\r
60 chars, last, text, index;
\r
64 last = stack[ stack.length - 1 ];
\r
66 // Make sure we're not in a script or style element
\r
67 if ( last && special[ handler.isXML ? last.toUpperCase() : last ] === 1 ) {
\r
68 if( 0 <= ( index = html.toUpperCase().indexOf( '</' + ( handler.isXML ? last.toUpperCase() : last ) ) ) ){
\r
69 handler.chars( html.substring( 0, index ) );
\r
70 if( index = X_HTMLParser__parseEndTag( stack, handler, html ) ){
\r
71 html = html.substring( index );
\r
73 handler.chars( html );
\r
77 handler.chars( html );
\r
82 if ( html.indexOf("<!--") === 0 ) {
\r
83 if ( 0 < ( index = html.indexOf("-->") ) ) {
\r
84 handler.comment( html.substring( 4, index ) );
\r
85 html = html.substring( index + 3 );
\r
90 } else if ( html.indexOf("</") === 0 ) {
\r
91 if ( 2 < ( index = X_HTMLParser__parseEndTag( stack, handler, html ) ) ) {
\r
92 html = html.substring( index );
\r
97 } else if ( html.indexOf("<") === 0 ) {
\r
98 if( index = X_HTMLParser__parseStartTag( stack, last, handler, html ) ){
\r
99 html = html.substring( index );
\r
102 if( index === false ){
\r
108 index = html.indexOf("<");
\r
110 text = index < 0 ? html : html.substring( 0, index );
\r
111 html = index < 0 ? '' : html.substring( index );
\r
113 handler.chars( text );
\r
118 if( html === lastHtml ){
\r
119 handler.err( html );
\r
123 if( async && startTime + 15 <= X_Timer_now() && html ){
\r
124 handler.progress( 1 - html.length / async[ 0 ] );
\r
125 X_Timer_once( 0, X_HTMLParser_exec, [ html, handler, async ] );
\r
132 // Clean up any remaining tags
\r
133 X_HTMLParser_parseEndTag( stack, handler );
\r
135 async && handler.complete();
\r
138 function X_HTMLParser__parseStartTag( stack, last, handler, html ){
\r
139 var alphabets = X_HTMLParser_CHARS,
\r
140 whiteSpace = X_HTMLParser_CHARS,
\r
141 saveAttr = X_HTMLParser_saveAttr,
\r
142 uri = X_Dom_DTD_ATTR_VAL_IS_URI,
\r
147 tagName, empty = false,
\r
148 chr, start, attrName, quot, escape, tagUpper;
\r
150 while( i < l && phase < 9 ){
\r
151 chr = html.charAt( i );
\r
154 chr === '<' && ( ++phase );
\r
156 case 1 : // タグ名の開始を待つ
\r
157 ( alphabets[ chr ] & 3 ) && ( ++phase && ( start = i ) );
\r
159 case 2 : // タグ名の終わりの空白文字を待つ
\r
160 ( whiteSpace[ chr ] & 16 ) ?
\r
161 ( ++phase && ( tagName = html.substring( start, i ) ) ) :
\r
162 ( chr === '>' || ( empty = html.substr( i, 2 ) === '/>' ) ) &&
\r
163 ( ( tagName = html.substring( start, i ) ) && ( phase = 9 ) );
\r
165 case 3 : // 属性名の開始を待つ
\r
166 ( alphabets[ chr ] & 3 ) ?
\r
167 ( ++phase && ( start = i ) ) :
\r
168 ( chr === '>' || ( empty = html.substr( i, 2 ) === '/>' ) ) &&
\r
171 case 4 : // 属性名の終わりを待つ
\r
173 ( ( phase = 6 ) && ( attrName = html.substring( start, i ) ) ) :
\r
174 ( whiteSpace[ chr ] & 16 ) &&
\r
175 ( ( phase = 5 ) && ( attrName = html.substring( start, i ) ) );
\r
177 case 5 : // 属性の = または次の属性または htmlタグの閉じ
\r
178 ( whiteSpace[ chr ] & 16 ) ?// ie4 未対応の属性には cite = http:// となる
\r
180 ( alphabets[ chr ] & 3 ) ?
\r
181 ( ( phase = 4 ) && ( attrs[ attrs.length ] = attrName ) && ( start = i ) ) :
\r
184 ( chr === '>' || ( empty = html.substr( i, 2 ) === '/>' ) ) &&
\r
185 ( ( phase = 9 ) && ( attrs[ attrs.length ] = attrName ) );
\r
187 case 6 : // 属性値の開始 quot を待つ
\r
188 ( chr === '"' || chr === "'" ) ?
\r
189 ( ( phase = 7 ) && ( quot = chr ) && ( start = i + 1 ) ):
\r
190 !( whiteSpace[ chr ] & 16 ) &&
\r
191 ( ( phase = 8 ) && ( start = i ) ); // no quot
\r
193 case 7 : //属性値の閉じ quot を待つ
\r
194 !escape && ( chr === quot ) && ( phase = 3 ) && saveAttr( attrs, attrName, html.substring( start, i ) );
\r
196 case 8 : //閉じ quot のない属性の値
\r
197 ( whiteSpace[ chr ] & 16 ) ?
\r
198 ( ( phase = 3 ) && saveAttr( attrs, attrName, html.substring( start, i ) ) ) :
\r
200 ( ( phase = 9 ) && saveAttr( attrs, attrName, html.substring( start, i ) ) ) :
\r
201 !escape && !uri[ attrName ] && ( empty = html.substr( i, 2 ) === '/>' ) && // attr の val が uri で / で終わりかつ、未対応属性の場合
\r
205 escape = chr === '\\' && !escape; // \\\\ is not escape for "
\r
210 //if( X_HTMLParser_parseStartTag( stack, last, handler, tagName, attrs, empty, i ) === false ) return false;
\r
212 tagUpper = tagName.toUpperCase();
\r
214 if( !X_HTMLParser_skipFixNesting && X_HTMLParser_block[ tagUpper ] === 1 ){
\r
215 while( last && X_HTMLParser_inline[ handler.isXML ? last.toUpperCase() : last ] === 1 ){
\r
216 X_HTMLParser_parseEndTag( stack, handler, last );
\r
217 last = stack[ stack.length - 1 ];
\r
220 last && X_HTMLParser_closeSelf[ tagUpper ] === 1 &&
\r
221 ( last === tagName || ( X_HTMLParser_sisters[ tagUpper ] && X_HTMLParser_sisters[ tagUpper ][ handler.isXML ? last.toUpperCase() : last ] === 1 ) ) &&
\r
222 X_HTMLParser_parseEndTag( stack, handler, last );
\r
223 empty = empty || X_Dom_DTD_EMPTY[ tagUpper ];
\r
224 !empty && ( stack[ stack.length ] = handler.isXML ? tagName : tagUpper );
\r
226 if( handler.start( handler.isXML ? tagName : tagUpper, attrs, empty, i ) === false ) return false;
\r
233 function X_HTMLParser__parseEndTag( stack, handler, html ){
\r
234 var alphabets = X_HTMLParser_CHARS,
\r
235 whiteSpace = X_HTMLParser_CHARS,
\r
242 while( i < l && phase < 9 ){
\r
243 chr = html.charAt( i );
\r
246 html.substr( i, 2 ) === '</' && ( ++phase && ++i );
\r
248 case 1 : // タグ名の開始を待つ
\r
249 ( alphabets[ chr ] & 3 ) && ( ++phase && ( start = i ) );
\r
251 case 2 : // タグ名の終わりの空白文字を待つ
\r
252 ( whiteSpace[ chr ] & 16 ) && ( ++phase );
\r
253 ( chr === '>' ) && ( phase = 9 );
\r
254 ( phase !== 2 ) && ( tagName = html.substring( start, i ) );
\r
256 case 3 : // タグの終了を待つ
\r
257 chr === '>' && ( phase = 9 );
\r
263 X_HTMLParser_parseEndTag( stack, handler, handler.isXML ? tagName : tagName.toUpperCase() );
\r
269 function X_HTMLParser_saveAttr( attrs, name, value ){
\r
270 name = name.toLowerCase();
\r
271 value = X_Node_Attr_noValue[ name ] === 1 ? name : value;
\r
272 attrs[ attrs.length ] = {
\r
276 value.indexOf( '"' ) !== -1 ?
\r
277 value.split( '"' ).join( '\\"' ).split( '\\\\"' ).join( '\\"' ) :
\r
282 function X_HTMLParser_parseEndTag( stack, handler, tagName ) {
\r
283 var pos = 0, i = stack.length;
\r
284 // If no tag name is provided, clean shop
\r
286 // Find the closest opened tag of the same type
\r
288 for ( pos = i; 0 <= pos; )
\r
289 if ( stack[ --pos ] === tagName )
\r
293 // Close all the open elements, up the stack
\r
295 handler.end( stack[ --i ] );
\r
297 // Remove the open elements from the stack
\r
298 stack.length = pos;
\r
302 var X_HTMLParser_htmlStringToXNode = {
\r
306 err : function( html ){
\r
307 X_HTMLParser_htmlStringToXNode.flat.length = 0;
\r
308 !X_HTMLParser_htmlStringToXNode.ignoreError && X.Logger.warn( 'X_Dom_Parser() error ' + html );
\r
310 start : function( tagName, attrs, noChild, length ){
\r
312 nest = X_HTMLParser_htmlStringToXNode.nest,
\r
313 flat = X_HTMLParser_htmlStringToXNode.flat,
\r
315 attr, name, i, _attrs; //, toIndex;
\r
317 xnode = nest[ l - 1 ][ 'create' ]( tagName );
\r
319 xnode = flat[ flat.length ] = X_Doc_create( tagName );
\r
321 if( !noChild ) nest[ l ] = xnode;
\r
322 if( i = attrs.length ){
\r
325 if( attr = attrs[ --i ] ){
\r
326 if( X_Type_isString( attr ) ){
\r
328 _attrs[ name ] = true;
\r
331 _attrs[ name ] = attr.escaped;
\r
335 xnode[ 'attr' ]( _attrs );
\r
339 0 < X_HTMLParser_htmlStringToXNode.nest.length && ( --X_HTMLParser_htmlStringToXNode.nest.length );
\r
341 chars : function( text ){
\r
342 if( X_HTMLParser_htmlStringToXNode.nest.length ){
\r
343 X_HTMLParser_htmlStringToXNode.nest[ X_HTMLParser_htmlStringToXNode.nest.length - 1 ][ 'createText' ]( text );
\r
345 X_HTMLParser_htmlStringToXNode.flat[ X_HTMLParser_htmlStringToXNode.flat.length ] = X_Doc_createText( text );
\r
348 comment : X_emptyFunction
\r
351 function X_HtmlParser_parse( html, ignoreError ){
\r
352 var worker = X_HTMLParser_htmlStringToXNode, ret;
\r
354 worker.nest.length = 0;
\r
355 worker.ignoreError = ignoreError;
\r
356 X_HTMLParser_exec( html, worker );
\r
358 delete worker.flat;
\r
362 var X_HTMLParser_asyncHtmlStringToXNode = {
\r
364 err : function( html ){
\r
365 X_HTMLParser_htmlStringToXNode.err( html );
\r
366 this[ 'asyncDispatch' ]( X_EVENT_ERROR );
\r
368 start : X_HTMLParser_htmlStringToXNode.start,
\r
369 end : X_HTMLParser_htmlStringToXNode.end,
\r
370 chars : X_HTMLParser_htmlStringToXNode.chars,
\r
371 comment : X_emptyFunction,
\r
373 progress : function( pct ){
\r
374 this[ 'asyncDispatch' ]( { type : X_EVENT_PROGRESS, percent : pct } );
\r
376 complete : function(){
\r
377 var ret = X_HTMLParser_htmlStringToXNode.flat;
\r
378 delete X_HTMLParser_htmlStringToXNode.flat;
\r
379 this[ 'asyncDispatch' ]( { type : X_EVENT_SUCCESS, xnodes : ret } );
\r
383 function X_HTMLParser_asyncParse( html, ignoreError ){
\r
384 var dispatcher = X_Class_override( X_EventDispatcher(), X_HTMLParser_asyncHtmlStringToXNode ),
\r
385 worker = X_HTMLParser_htmlStringToXNode;
\r
386 dispatcher[ 'listenOnce' ]( X_EVENT_SUCCESS, dispatcher, dispatcher[ 'kill' ] );
\r
388 worker.nest.length = 0;
\r
389 worker.ignoreError = ignoreError;
\r
390 X_HTMLParser_exec( html, dispatcher, [ html.length, [] ] );
\r