3 * Original code by Erik John Resig (ejohn.org)
\r
4 * http://ejohn.org/blog/pure-javascript-html-parser/
\r
// Namespace object of the HTML parser: lookup tables first, scanner functions further down.
8 var X_Dom_Parser = { // HTMLParser
\r
// Character-class entries: upper-case letters carry the value 1 ...
// NOTE(review): presumably these rows are the body of the CHARS table that the scanners read via
// X_Dom_Parser.CHARS; the line that opens that table is not visible here - confirm.
10 A:1,B:1,C:1,D:1,E:1,F:1,G:1,H:1,I:1,J:1,K:1,L:1,M:1,N:1,O:1,P:1,Q:1,R:1,S:1,T:1,U:1,V:1,W:1,X:1,Y:1,Z:1,
\r
// ... lower-case letters carry the value 2 (so a `& 3` test accepts either case) ...
11 a:2,b:2,c:2,d:2,e:2,f:2,g:2,h:2,i:2,j:2,k:2,l:2,m:2,n:2,o:2,p:2,q:2,r:2,s:2,t:2,u:2,v:2,w:2,x:2,y:2,z:2,
\r
12 // "0" : 4, "1" : 4, "2" : 4, "3" : 4, "4" : 4, "5" : 4, "6" : 4, "7" : 4, "8" : 4, "9" : 4, minifying this with Closure Compiler causes an error in IE4; use eval
\r
// ... and whitespace characters carry the value 16 (matched with `& 16`).
// NOTE(review): the scanners look characters up one at a time (html.charAt), so the two-character
// '\r\n' key can presumably never be hit - confirm whether it is needed.
14 '\t' : 16, '\r\n' : 16, '\r' : 16, '\n' : 16, '\f' : 16, '\b' : 16, ' ' : 16
\r
// The same letter and whitespace characters as plain strings.
16 alphabets : 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz',
\r
17 whiteSpace : '\t\r\n\f\b ',
\r
19 // Empty Elements - HTML 4.01
\r
// Taken from X_Dom_DTD_EMPTY, which is defined outside this section; parseStartTag indexes it by upper-case tag name.
20 empty : X_Dom_DTD_EMPTY,
\r
22 // Block Elements - HTML 4.01
\r
// Keys are upper-case tag names; parseStartTag tests membership with `=== 1`.
23 block : {ADDRESS:1,APPLET:1,BLOCKQUOTE:1,BUTTON:1,CENTER:1,DD:1,DEL:1,DIR:1,DIV:1,DL:1,DT:1,FIELDSET:1,FORM:1,FRAMESET:1,HR:1,IFRAME:1,INS:1,
\r
24 ISINDEX:1,LI:1,MAP:1,MENU:1,NOFRAMES:1,NOSCRIPT:1,OBJECT:1,OL:1,P:1,PRE:1,SCRIPT:1,TABLE:1,TBODY:1,TD:1,TFOOT:1,TH:1,THEAD:1,TR:1,UL:1 },
\r
25 // Inline Elements - HTML 4.01
\r
// Open elements listed here are closed by parseStartTag when a block element starts.
26 inline : {A:1,ABBR:1,ACRONYM:1,APPLET:1,B:1,BASEFONT:1,BDO:1,BIG:1,BR:1,BUTTON:1,CITE:1,CODE:1,DEL:1,DFN:1,EM:1,FONT:1,I:1,IFRAME:1,IMG:1,
\r
27 INPUT:1,INS:1,KBD:1,LABEL:1,MAP:1,OBJECT:1,Q:1,S:1,SAMP:1,SCRIPT:1,SELECT:1,SMALL:1,SPAN:1,STRIKE:1,STRONG:1,SUB:1,SUP:1,TEXTAREA:1,TT:1,U:1,VAR:1},
\r
28 // Elements that you can, intentionally, leave open
\r
29 // (and which close themselves)
\r
30 closeSelf : {OLGROUP:1,DD:1,DT:1,LI:1,OPTIONS:1,P:1,TBODY:1,TD:1,TFOOT:1,TH:1,THEAD:1,TR:1}, // add tbody
\r
// Per-tag sets of sibling tags, keyed by the tag that is starting.
// parseStartTag evaluates sisters[ tagName ][ last ]: when a closeSelf tag starts while one of the
// listed tags is the innermost open element, that open element is closed first.
// NOTE(review): presumably these rows are the body of the `sisters` table; the line that opens it is not visible here.
37 COLGROUP : { CAPTION : 1 },
\r
38 THEAD : { CAPTION : 1, COLGROUP : 1 },
\r
39 TFOOT : { CAPTION : 1, COLGROUP : 1, THEAD : 1, TBODY : 1 },
\r
40 TBODY : { CAPTION : 1, COLGROUP : 1, THEAD : 1, TFOOT : 1 }
\r
43 * http://www.tohoho-web.com/html/tbody.htm
\r
44 * HTML4.01では、ヘッダとフッタを先読みして表示するために、<tbody> よりも <tfoot> の方を先に記述しなくてはならないと定義されています。
\r
45 * IE5.0 などでは HEAD → BODY → FOOT の順に表示するのですが、
\r
46 * <tfoot> に未対応の古いブラウザでは、HEAD → FOOT → BODY の順に表示されてしまいます。
\r
47 * また、HTML5 では、<tfoot> と <tbody> の順番はどちらでもよいことになりました。
\r
50 // Attributes that have their values filled in disabled="disabled"
\r
// Shared with X_Node_Attr.noValue (defined outside this section); saveAttr replaces the value by the
// attribute name when the lower-cased name maps to 1 here.
51 fillAttrs : X_Node_Attr.noValue, //{checked:1,compact:1,declare:1,defer:1,disabled:1,ismap:1,multiple:1,nohref:1,noresize:1,noshade:1,nowrap:1,readonly:1,selected:1};
\r
53 // Special Elements (can contain anything)
\r
// While one of these is the innermost open element, exec treats everything up to its closing tag as text.
54 special : { SCRIPT : 1, STYLE : 1, PLAINTEXT : 1, XMP : 1, TEXTAREA : 1 },
\r
// Consumes `html` from the front, reporting comments, tags and text to `handler`.
// - html    : markup string still to be parsed
// - handler : object whose chars / comment / err callbacks are invoked here; progress and complete
//             are additionally invoked when `async` is given
// - async   : optional array; async[ 0 ] is the divisor used for progress reporting and async[ 1 ]
//             is the open-tag stack that is carried over between time slices
56 exec : function( html, handler, async ){
\r
57 var special = X_Dom_Parser.special,
\r
58 //plainText = X_Dom_Parser.plainText,
\r
// Start time of this slice; only taken in async mode.
59 startTime = async && X_Timer_now(),
\r
60 _parseStartTag = X_Dom_Parser._parseStartTag,
\r
61 _parseEndTag = X_Dom_Parser._parseEndTag,
\r
// Async runs reuse the stack stored in async[ 1 ]; synchronous runs start with an empty one.
62 stack = async ? async[ 1 ] : [],
\r
64 chars, last, text, index;
\r
// Innermost open element (top of the stack).
68 last = stack[ stack.length - 1 ];
\r
70 // Make sure we're not in a script or style element
\r
71 if ( last && special[ last ] === 1 ) {
\r
// Case-insensitive search for the closing tag of the special element.
72 if( 0 <= ( index = html.toUpperCase().indexOf( '</' + last ) ) ){
\r
// Everything in front of the closing tag is reported as text.
73 handler.chars( html.substring( 0, index ) );
\r
// A truthy result of _parseEndTag is used as the offset at which parsing continues.
74 if( index = _parseEndTag( stack, handler, html ) ){
\r
75 html = html.substring( index );
\r
77 handler.chars( html );
\r
81 handler.chars( html );
\r
// Comment: only handled when the input starts with "<!--" and a closing "-->" exists.
86 if ( html.indexOf("<!--") === 0 ) {
\r
87 if ( 0 < ( index = html.indexOf("-->") ) ) {
\r
// Report the text between "<!--" and "-->", then skip past the "-->".
88 handler.comment( html.substring( 4, index ) );
\r
89 html = html.substring( index + 3 );
\r
// End tag.
94 } else if ( html.indexOf("</") === 0 ) {
\r
// Only results greater than 2 (more than the "</" itself) advance the input.
95 if ( 2 < ( index = _parseEndTag( stack, handler, html ) ) ) {
\r
96 html = html.substring( index );
\r
// Start tag.
101 } else if ( html.indexOf("<") === 0 ) {
\r
102 if( index = _parseStartTag( stack, last, handler, html ) ){
\r
103 html = html.substring( index );
\r
// _parseStartTag yields false when X_Dom_Parser.parseStartTag (i.e. the handler's start callback) returned false.
106 if( index === false ){
\r
// Plain text: everything up to the next "<", or the whole rest when there is none.
112 index = html.indexOf("<");
\r
114 text = index < 0 ? html : html.substring( 0, index );
\r
115 html = index < 0 ? '' : html.substring( index );
\r
117 handler.chars( text );
\r
// Input unchanged compared with lastHtml: report it as an error.
// NOTE(review): the declaration and assignment of lastHtml are not visible here.
122 if( html === lastHtml ){
\r
123 handler.err( html );
\r
// Async time slicing: once 15 units of X_Timer_now() have passed (presumably milliseconds - confirm)
// and input remains, report progress and schedule another exec call with the remaining input.
127 if( async && startTime + 15 <= X_Timer_now() && html ){
\r
// Progress is 1 minus the remaining length divided by async[ 0 ].
128 handler.progress( 1 - html.length / async[ 0 ] );
\r
129 X.Timer.once( 0, X_Dom_Parser.exec, [ html, handler, async ] );
\r
136 // Clean up any remaining tags
\r
// Called without a tag name.
137 X_Dom_Parser.parseEndTag( stack, handler );
\r
// Completion is only signalled in async mode.
139 async && handler.complete();
\r
// Character-level scanner for a start tag at the beginning of `html`.
// Walks the input with a small state machine (`phase`), collects the tag name and the attributes,
// and hands them to X_Dom_Parser.parseStartTag.
// - stack, last, handler : passed through to X_Dom_Parser.parseStartTag
// - html                 : input to scan
// Returns false when X_Dom_Parser.parseStartTag returns false. The other return paths are not
// visible here; exec uses a truthy result as the offset at which to continue.
142 _parseStartTag : function( stack, last, handler, html ){
\r
// Both lookups use the same table: `& 3` accepts entries valued 1 or 2, `& 16` accepts entries valued 16.
143 var alphabets = X_Dom_Parser.CHARS,
\r
144 whiteSpace = X_Dom_Parser.CHARS,
\r
145 saveAttr = X_Dom_Parser.saveAttr,
\r
// Looked up by attribute name while scanning unquoted values (see phase 8).
146 uri = X_Dom_DTD_ATTR_VAL_IS_URI,
\r
151 tagName, empty = false,
\r
152 chr, start, attrName, quot, escape;
\r
// Scan until the tag is finished (phase 9) or the input runs out.
154 while( i < l && phase < 9 ){
\r
155 chr = html.charAt( i );
\r
// A '<' moves on to the next phase.
158 chr === '<' && ( ++phase );
\r
160 case 1 : // wait for the start of the tag name
\r
161 ( alphabets[ chr ] & 3 ) && ( ++phase && ( start = i ) );
\r
163 case 2 : // wait for the whitespace that ends the tag name
\r
// Whitespace ends the name and continues with attributes; '>' or '/>' ends the name and the tag.
164 ( whiteSpace[ chr ] & 16 ) ?
\r
165 ( ++phase && ( tagName = html.substring( start, i ) ) ) :
\r
166 ( chr === '>' || ( empty = html.substr( i, 2 ) === '/>' ) ) &&
\r
167 ( ( tagName = html.substring( start, i ) ) && ( phase = 9 ) );
\r
169 case 3 : // wait for the start of an attribute name
\r
170 ( alphabets[ chr ] & 3 ) ?
\r
171 ( ++phase && ( start = i ) ) :
\r
172 ( chr === '>' || ( empty = html.substr( i, 2 ) === '/>' ) ) &&
\r
175 case 4 : // wait for the end of the attribute name
\r
// NOTE(review): the condition that selects phase 6 is not visible here.
177 ( ( phase = 6 ) && ( attrName = html.substring( start, i ) ) ) :
\r
178 ( whiteSpace[ chr ] & 16 ) &&
\r
179 ( ( phase = 5 ) && ( attrName = html.substring( start, i ) ) );
\r
181 case 5 : // the attribute's "=", or the next attribute, or the end of the html tag
\r
182 ( whiteSpace[ chr ] & 16 ) ?// for attributes IE4 does not support, the markup becomes: cite = http://
\r
// A letter starts the next attribute: the pending name is stored as a bare string (no value).
184 ( alphabets[ chr ] & 3 ) ?
\r
185 ( ( phase = 4 ) && ( attrs[ attrs.length ] = attrName ) && ( start = i ) ) :
\r
// End of the tag: the pending name is likewise stored as a bare string.
188 ( chr === '>' || ( empty = html.substr( i, 2 ) === '/>' ) ) &&
\r
189 ( ( phase = 9 ) && ( attrs[ attrs.length ] = attrName ) );
\r
191 case 6 : // wait for the opening quote of the attribute value
\r
192 ( chr === '"' || chr === "'" ) ?
\r
193 ( ( phase = 7 ) && ( quot = chr ) && ( start = i + 1 ) ):
\r
194 !( whiteSpace[ chr ] & 16 ) &&
\r
195 ( ( phase = 8 ) && ( start = i ) ); // no quot
\r
197 case 7 : // wait for the closing quote of the attribute value
\r
// Only an unescaped quote of the same kind that opened the value ends it.
198 !escape && ( chr === quot ) && ( phase = 3 ) && saveAttr( attrs, attrName, html.substring( start, i ) );
\r
200 case 8 : // value of an attribute without quotes
\r
201 ( whiteSpace[ chr ] & 16 ) ?
\r
202 ( ( phase = 3 ) && saveAttr( attrs, attrName, html.substring( start, i ) ) ) :
\r
204 ( ( phase = 9 ) && saveAttr( attrs, attrName, html.substring( start, i ) ) ) :
\r
205 !escape && !uri[ attrName ] && ( empty = html.substr( i, 2 ) === '/>' ) && // case where the attribute value is a URI ending in "/" and the attribute is unsupported
\r
209 escape = chr === '\\' && !escape; // \\\\ is not escape for "
\r
// Tag names are forwarded in upper case; a false result is propagated to the caller.
214 if( X_Dom_Parser.parseStartTag( stack, last, handler, tagName.toUpperCase(), attrs, empty, i ) === false ) return false;
\r
// Character-level scanner for an end tag in `html`.
// Walks the input with a small state machine (`phase`), extracts the tag name and forwards it,
// upper-cased, to X_Dom_Parser.parseEndTag.
// - stack, handler : passed through to X_Dom_Parser.parseEndTag
// - html           : input to scan
// NOTE(review): the return statement is not visible here; exec uses the result as the offset at which to continue.
220 _parseEndTag : function( stack, handler, html ){
\r
// Both lookups use the same table: `& 3` accepts entries valued 1 or 2, `& 16` accepts entries valued 16.
221 var alphabets = X_Dom_Parser.CHARS,
\r
222 whiteSpace = X_Dom_Parser.CHARS,
\r
// Scan until the tag is finished (phase 9) or the input runs out.
229 while( i < l && phase < 9 ){
\r
230 chr = html.charAt( i );
\r
// "</" moves on to the next phase; the extra ++i steps over the second character of "</".
233 html.substr( i, 2 ) === '</' && ( ++phase && ++i );
\r
235 case 1 : // wait for the start of the tag name
\r
236 ( alphabets[ chr ] & 3 ) && ( ++phase && ( start = i ) );
\r
238 case 2 : // wait for the whitespace that ends the tag name
\r
239 ( whiteSpace[ chr ] & 16 ) && ( ++phase );
\r
240 ( chr === '>' ) && ( phase = 9 );
\r
// Once the phase has left 2 (whitespace or '>' seen), the name spans start..i.
241 ( phase !== 2 ) && ( tagName = html.substring( start, i ) );
\r
243 case 3 : // wait for the end of the tag
\r
244 chr === '>' && ( phase = 9 );
\r
250 X_Dom_Parser.parseEndTag( stack, handler, tagName.toUpperCase() );
\r
// Appends one attribute record to `attrs`.
// - attrs : array that receives the new record
// - name  : attribute name; lower-cased before use
// - value : attribute value as found in the markup
// NOTE(review): the field names of the record are not visible here; the start handler later reads
// an `escaped` field from these records.
256 saveAttr : function( attrs, name, value ){
\r
257 name = name.toLowerCase();
\r
// Attributes flagged in fillAttrs take their own name as value (e.g. disabled="disabled").
258 value = X_Dom_Parser.fillAttrs[ name ] === 1 ? name : value;
\r
259 attrs[ attrs.length ] = {
\r
// Only values that contain a double quote are rewritten.
263 value.indexOf( '"' ) !== -1 ?
\r
// Put a backslash in front of every '"', then collapse the doubled backslash this produces
// in front of quotes that were already escaped.
264 value.split( '"' ).join( '\\"' ).split( '\\\\"' ).join( '\\"' ) :
\r
// Applies the implicit-close rules for a start tag, records it on the stack and notifies the handler.
// - stack   : array of currently open tag names (upper case)
// - last    : innermost open tag name, or a falsy value when nothing is open
// - handler : object whose start callback is invoked (and whose end callback parseEndTag invokes)
// - tagName : upper-case name of the tag that starts
// - attrs   : attribute list collected by the scanner
// - empty   : true when the tag was written as "<.../>"
// - index   : forwarded unchanged as the fourth argument of handler.start
// Returns whatever handler.start returns.
269 parseStartTag : function( stack, last, handler, tagName, attrs, empty, index ) {
\r
270 var inline = X_Dom_Parser.inline,
\r
271 parseEndTag = X_Dom_Parser.parseEndTag,
\r
272 sisters = X_Dom_Parser.sisters;
\r
// A block element closes every inline element that is open at the top of the stack.
273 if ( X_Dom_Parser.block[ tagName ] === 1 ) {
\r
274 while ( last && inline[ last ] === 1 ) {
\r
275 parseEndTag( stack, handler, last );
\r
276 last = stack[ stack.length - 1 ];
\r
// A closeSelf tag closes the innermost open element when that element has the same name
// or is listed among the sisters of the new tag.
279 last && X_Dom_Parser.closeSelf[ tagName ] === 1 && ( last === tagName || ( sisters[ tagName ] && sisters[ tagName ][ last ] === 1 ) ) && parseEndTag( stack, handler, last );
\r
// Tags in the `empty` table count as empty even without "/>".
280 empty = empty || X_Dom_Parser.empty[ tagName ];
\r
// Only non-empty elements are pushed on the stack.
281 !empty && ( stack[ stack.length ] = tagName );
\r
283 return handler.start( tagName, attrs, empty, index );
\r
// Closes open elements on the stack and notifies the handler for each one.
// - stack   : array of currently open tag names; truncated by this function
// - handler : object whose end callback is invoked with each closed tag name
// - tagName : tag to close; exec omits it to close everything that is still open
286 parseEndTag : function( stack, handler, tagName ) {
\r
287 var pos = 0, i = stack.length;
\r
288 // If no tag name is provided, clean shop
\r
290 // Find the closest opened tag of the same type
\r
// Searches from the top of the stack downwards; pos is decremented before each comparison.
292 for ( pos = i; 0 <= pos; )
\r
293 if ( stack[ --pos ] === tagName )
\r
297 // Close all the open elements, up the stack
\r
299 handler.end( stack[ --i ] );
\r
301 // Remove the open elements from the stack
\r
302 stack.length = pos;
\r
// Handler object for X_Dom_Parser.exec that builds nodes from the parser callbacks.
// State is kept on the object itself: `flat` collects the top-level nodes and `nest` holds the
// chain of currently open nodes.
308 var X_HTMLParser_htmlStringToXNode = {
\r
// Error callback: discards the nodes collected so far and logs a warning unless ignoreError is exactly true.
311 err : function( html ){
\r
312 X_HTMLParser_htmlStringToXNode.flat.length = 0;
\r
313 X_HTMLParser_htmlStringToXNode.ignoreError !== true && X.Logger.warn( 'X_Dom_Parser() error ' + html );
\r
// Start-tag callback: creates the node for `tagName` and applies its attributes.
// - attrs   : list whose entries are either strings or records with an `escaped` field
// - noChild : when truthy the node is not pushed on `nest`
315 start : function( tagName, attrs, noChild, length ){
\r
317 nest = X_HTMLParser_htmlStringToXNode.nest,
\r
318 flat = X_HTMLParser_htmlStringToXNode.flat,
\r
320 attr, name, i, _attrs; //, toIndex;
\r
// NOTE(review): the condition choosing between the next two statements is not visible here;
// presumably the node becomes a child of the innermost open node when there is one ...
322 xnode = nest[ l - 1 ].create( tagName );
\r
// ... and otherwise a new top-level entry of `flat`.
324 xnode = flat[ flat.length ] = X.Dom.Node.create( tagName );
\r
// Elements that can have children become the new innermost open node.
326 if( !noChild ) nest[ l ] = xnode;
\r
327 if( i = attrs.length ){
\r
// Attributes are read from the last entry to the first.
330 if( attr = attrs[ --i ] ){
\r
// String entries are stored as true ...
331 if( X.Type.isString( attr ) ){
\r
333 _attrs[ name ] = true;
\r
// ... other entries contribute their `escaped` field.
336 _attrs[ name ] = attr.escaped;
\r
340 xnode.attr( _attrs );
\r
// Drops the innermost open node, never going below an empty `nest`.
344 0 < X_HTMLParser_htmlStringToXNode.nest.length && ( --X_HTMLParser_htmlStringToXNode.nest.length );
\r
// Text callback: adds a text node to the innermost open node, or to `flat` when nothing is open.
346 chars : function( text ){
\r
347 if( X_HTMLParser_htmlStringToXNode.nest.length ){
\r
348 X_HTMLParser_htmlStringToXNode.nest[ X_HTMLParser_htmlStringToXNode.nest.length - 1 ].createText( text );
\r
350 X_HTMLParser_htmlStringToXNode.flat[ X_HTMLParser_htmlStringToXNode.flat.length ] = X.Dom.Node.createText( text );
\r
// Comments are passed to X.emptyFunction.
353 comment : X.emptyFunction
\r
// Parses `html` in one synchronous X_Dom_Parser.exec run using the shared X_HTMLParser_htmlStringToXNode handler.
// - html        : markup string to parse
// - ignoreError : stored on the handler; when exactly true the handler's err callback does not log a warning
// NOTE(review): the return statement is not visible here; presumably the collected `flat` list is returned via `ret`.
356 function X_HtmlParser_parse( html, ignoreError ){
\r
357 var worker = X_HTMLParser_htmlStringToXNode, ret;
\r
// Reset the open-node chain of the shared handler before parsing.
359 worker.nest.length = 0;
\r
360 worker.ignoreError = ignoreError;
\r
// No third argument: exec runs without time slicing.
361 X_Dom_Parser.exec( html, worker );
\r
// The result list is detached from the shared handler after the run.
363 delete worker.flat;
\r
// Handler members for the asynchronous parse. X_HTMLParser_asyncParse copies them onto an
// X.EventDispatcher via X.Class._override, so `this` in the functions below refers to whatever
// object the parser invokes them on.
// NOTE(review): start/end/chars are the functions of X_HTMLParser_htmlStringToXNode and keep their
// state on that shared object, so overlapping parses would presumably share `flat` and `nest` - confirm.
367 var X_HTMLParser_asyncHtmlStringToXNode = {
\r
// Error callback: runs the synchronous handler's error handling, then dispatches X.Event.ERROR.
368 err : function( html ){
\r
369 X_HTMLParser_htmlStringToXNode.err( html );
\r
370 this.asyncDispatch( X.Event.ERROR );
\r
372 start : X_HTMLParser_htmlStringToXNode.start,
\r
373 end : X_HTMLParser_htmlStringToXNode.end,
\r
374 chars : X_HTMLParser_htmlStringToXNode.chars,
\r
375 comment : X.emptyFunction,
\r
// Progress callback: forwards the value computed by exec as the `percent` field of a PROGRESS event.
377 progress : function( pct ){
\r
378 this.asyncDispatch( { type : X.Event.PROGRESS, percent : pct } );
\r
// Completion callback: detaches the collected nodes from the shared handler and delivers them
// as the `xnodes` field of a SUCCESS event.
380 complete : function(){
\r
381 var ret = X_HTMLParser_htmlStringToXNode.flat;
\r
382 delete X_HTMLParser_htmlStringToXNode.flat;
\r
383 this.asyncDispatch( { type : X.Event.SUCCESS, xnodes : ret } );
\r
// Starts a time-sliced parse of `html`; results are delivered through events of a dispatcher object.
// - html        : markup string to parse
// - ignoreError : stored on the shared handler; when exactly true its err callback does not log a warning
// NOTE(review): the return statement is not visible here; presumably the dispatcher is returned so callers can listen on it.
387 function X_HTMLParser_asyncParse( html, ignoreError ){
\r
// A fresh X.EventDispatcher that receives the members of X_HTMLParser_asyncHtmlStringToXNode.
388 var dispatcher = X.Class._override( new X.EventDispatcher(), X_HTMLParser_asyncHtmlStringToXNode ),
\r
389 worker = X_HTMLParser_htmlStringToXNode;
\r
// The dispatcher's kill method is registered to run once, on the SUCCESS event.
390 dispatcher.listenOnce( X.Event.SUCCESS, dispatcher, dispatcher.kill );
\r
// Reset the open-node chain of the shared handler before parsing.
392 worker.nest.length = 0;
\r
393 worker.ignoreError = ignoreError;
\r
// Third argument: [ total input length used for progress reporting, empty open-tag stack ].
394 X_Dom_Parser.exec( html, dispatcher, [ html.length, [] ] );
\r