2 * Original code by Erik John Resig (ejohn.org)
3 * http://ejohn.org/blog/pure-javascript-html-parser/
// Characters that may start a tag name or an attribute name (ASCII letters only).
var alphabets = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz';

// Characters the scanners treat as whitespace:
// tab, carriage return, line feed, form feed, backspace and space.
var whiteSpace = [ '\t', '\r', '\n', '\f', '\b', ' ' ].join( '' );
// Empty (void) elements: tags that never have content or an end tag.
// parseStartTag() treats them as unary, so they are not pushed on the
// open-tag stack.
// The HTML 4.01 list (plus `embed`) is extended with the HTML5 void elements
// `source`, `track` and `wbr`; without them those tags would stay open and
// swallow their following siblings.
var empty = {
    area : 1, base : 1, basefont : 1, br : 1, col : 1, frame : 1, hr : 1,
    img : 1, input : 1, isindex : 1, link : 1, meta : 1, param : 1,
    embed : 1,
    source : 1, track : 1, wbr : 1
};
// Block-level elements (HTML 4.01).
// Opening one of these first closes any inline elements that are still open
// on top of the stack (see parseStartTag).
var block = {
    address : 1, applet : 1, blockquote : 1, button : 1, center : 1,
    dd : 1, del : 1, dir : 1, div : 1, dl : 1, dt : 1,
    fieldset : 1, form : 1, frameset : 1,
    hr : 1,
    iframe : 1, ins : 1, isindex : 1,
    li : 1,
    map : 1, menu : 1,
    noframes : 1, noscript : 1,
    object : 1, ol : 1,
    p : 1, pre : 1,
    script : 1,
    table : 1, tbody : 1, td : 1, tfoot : 1, th : 1, thead : 1, tr : 1,
    ul : 1
};
// Inline elements (HTML 4.01).
// An open inline element is closed automatically when a block element
// starts (see parseStartTag).
var inline = {
    a : 1, abbr : 1, acronym : 1, applet : 1,
    b : 1, basefont : 1, bdo : 1, big : 1, br : 1, button : 1,
    cite : 1, code : 1,
    del : 1, dfn : 1,
    em : 1,
    font : 1,
    i : 1, iframe : 1, img : 1, input : 1, ins : 1,
    kbd : 1,
    label : 1,
    map : 1,
    object : 1,
    q : 1,
    s : 1, samp : 1, script : 1, select : 1, small : 1, span : 1,
    strike : 1, strong : 1, sub : 1, sup : 1,
    textarea : 1, tt : 1,
    u : 1,
    'var' : 1
};
// Elements that you can, intentionally, leave open (and which close
// themselves): when the same element -- or one of its `sisters` -- is opened
// while one of these is on top of the stack, the open one is closed first.
// `option` is the actual HTML element name; the historical `options` key is
// kept so existing lookups keep working.
var closeSelf = {
    colgroup : 1,
    dd : 1, dt : 1, li : 1,
    option : 1, options : 1,
    p : 1,
    tbody : 1, td : 1, tfoot : 1, th : 1, thead : 1, tr : 1 // add tbody
};
// NOTE(review): these look like entries of the `sisters` table that
// parseStartTag() consults -- confirm. Read each entry as
// "tag being opened : { tags that, if currently open on top of the stack, are closed first }".
30 colgroup : { caption : 1 },
31 thead : { caption : 1, colgroup : 1 },
32 tfoot : { caption : 1, colgroup : 1, thead : 1, tbody : 1 },
33 tbody : { caption : 1, colgroup : 1, thead : 1, tfoot : 1 }
36 * http://www.tohoho-web.com/html/tbody.htm
37 * HTML 4.01 requires <tfoot> to be written before <tbody>, so that the header and footer can be read ahead and rendered first.
38 * Browsers such as IE 5.0 render in HEAD -> BODY -> FOOT order,
39 * but older browsers that do not support <tfoot> end up rendering in HEAD -> FOOT -> BODY order.
40 * In HTML5, <tfoot> and <tbody> may appear in either order.
43 // Attributes whose value is filled in with the attribute name itself, e.g. disabled="disabled"
// (saveAttr() replaces the value by the name for every attribute flagged with 1 here).
// The table is taken from X.Dom.Attr.noValue; the trailing comment is presumably
// the literal list this alias replaced -- confirm against X.Dom.Attr.
44 var fillAttrs = X.Dom.Attr.noValue; //{checked:1,compact:1,declare:1,defer:1,disabled:1,ismap:1,multiple:1,nohref:1,noresize:1,noshade:1,nowrap:1,readonly:1,selected:1};
// Special elements (can contain anything): while one of these is the
// innermost open element, X.Dom.Parser only looks for its end tag.
var special = {
    script : 1,
    style : 1
};
/**
 * Walks the HTML string `html` from left to right and reports what it finds
 * through `handler`. handler.comment() and handler.chars() are called
 * directly from here; start and end tags are delegated to _parseStartTag(),
 * _parseEndTag() and parseEndTag(), which receive the same handler.
 *
 * @param {string} html    Markup to parse; it is consumed piece by piece with substring().
 * @param {Object} handler Callback object (comment, chars, plus whatever the tag helpers call).
 */
49 X.Dom.Parser = function( html, handler ) {
52 chars, last, text, index;
// `last` is the entry on top of the open-tag stack (undefined when the stack is empty).
56 last = stack[ stack.length - 1 ];
58 // Inside a script or style element (see `special`): let _parseEndTag() look for the end tag
// and, when it reports an index of 0 or more, skip ahead to that index.
59 if ( last && special[ last.toLowerCase() ] === 1 ) {
60 if( 0 <= ( index = _parseEndTag( stack, handler, html ) ) ){
61 //handler.chars( html.substring( 0, index ) );
62 html = html.substring( index );
// Comment: report the text between "<!--" (4 characters) and "-->", then skip past "-->" (3 characters).
66 if ( html.indexOf("<!--") === 0 ) {
67 if ( 0 < ( index = html.indexOf("-->") ) ) {
68 handler.comment( html.substring( 4, index ) );
69 html = html.substring( index + 3 );
// End tag: advance only when _parseEndTag() reports an index greater than 2.
74 } else if ( html.indexOf("</") === 0 ) {
75 if ( 2 < ( index = _parseEndTag( stack, handler, html ) ) ) {
76 html = html.substring( index );
// Start tag: a truthy result is the offset at which to resume.
81 } else if ( html.indexOf("<") === 0 ) {
82 if( index = _parseStartTag( stack, last, handler, html ) ){
83 html = html.substring( index );
// _parseStartTag() returns false when the start-tag handling itself returned false.
86 if( index === false ){
// Plain text: everything up to the next "<" (or the rest of the input) is reported via handler.chars().
92 index = html.indexOf("<");
94 text = index < 0 ? html : html.substring( 0, index );
95 html = index < 0 ? '' : html.substring( index );
97 handler.chars( text );
// NOTE(review): presumably lastHtml holds the input as it was before this pass,
// making this a "no progress was made" guard -- confirm.
102 if ( html === lastHtml ){
109 // Clean up any remaining tags (parseEndTag() without a tag name)
110 parseEndTag( stack, handler );
/**
 * Scans a start tag at the beginning of `html` one character at a time.
 * `phase` is the scanner state (see the case labels); the loop stops when
 * phase reaches 9 or the input runs out. The collected tag name and
 * attributes are then handed to parseStartTag().
 *
 * Attributes written without a value are stored in `attrs` as plain name
 * strings; attributes with a value are stored through saveAttr().
 *
 * @param {Array}  stack   Open-tag stack, forwarded to parseStartTag().
 * @param {string} last    Tag name on top of the stack, forwarded to parseStartTag().
 * @param {Object} handler Callback object, forwarded to parseStartTag().
 * @param {string} html    Remaining input; X.Dom.Parser calls this when it starts with "<".
 * @returns false when parseStartTag() returns false; X.Dom.Parser uses any
 *          other truthy result as the offset at which to resume.
 */
113 function _parseStartTag( stack, last, handler, html ){
119 chr, start, attrName, quot, escape;
121 while( i < l && phase < 9 ){
122 chr = html.charAt( i );
125 chr === '<' && ( ++phase );
127 case 1 : // wait for the start of the tag name
128 alphabets.indexOf( chr ) !== -1 && ( ++phase && ( start = i ) );
130 case 2 : // wait for the whitespace that ends the tag name (or for ">" / "/>")
131 whiteSpace.indexOf( chr ) !== -1 ?
132 ( ++phase && ( tagName = html.substring( start, i ) ) ) :
// NOTE(review): `empty` is assigned here and in later phases; presumably it is a
// local declared in this function's var list. If it is not, these assignments
// would overwrite the file-level `empty` table -- confirm.
133 ( chr === '>' || ( empty = html.substr( i, 2 ) === '/>' ) ) &&
134 ( ( tagName = html.substring( start, i ) ) && ( phase = 9 ) );
136 case 3 : // wait for the start of an attribute name
137 alphabets.indexOf( chr ) !== -1 ?
138 ( ++phase && ( start = i ) ) :
139 ( chr === '>' || ( empty = html.substr( i, 2 ) === '/>' ) ) &&
142 case 4 : // wait for the end of the attribute name
144 ( ( phase = 6 ) && ( attrName = html.substring( start, i ) ) ) :
145 whiteSpace.indexOf( chr ) !== -1 &&
146 ( ( phase = 5 ) && ( attrName = html.substring( start, i ) ) );
148 case 5 : // expect "=", the next attribute, or the end of the tag
149 alphabets.indexOf( chr ) !== -1 ?
// A new attribute name starts: the previous attribute had no value, so store its bare name.
150 ( ( phase = 4 ) && ( attrs[ attrs.length ] = attrName ) && ( start = i ) ) :
153 ( chr === '>' || ( empty = html.substr( i, 2 ) === '/>' ) ) &&
154 ( ( phase = 9 ) && ( attrs[ attrs.length ] = attrName ) );
156 case 6 : // wait for the quote that opens the attribute value
157 ( chr === '"' || chr === "'" ) ?
158 ( ( phase = 7 ) && ( quot = chr ) && ( start = i + 1 ) ):
159 whiteSpace.indexOf( chr ) === -1 &&
160 ( ( phase = 8 ) && ( start = i ) ); // no quot
162 case 7 : // wait for the quote that closes the attribute value (ignored while `escape` is set)
163 !escape && ( chr === quot ) && ( phase = 3 ) && saveAttr( attrs, attrName, html.substring( start, i ) );
165 case 8 : // unquoted attribute value: ends at whitespace, ">" or "/>"
166 whiteSpace.indexOf( chr ) !== -1 ?
167 ( ( phase = 3 ) && saveAttr( attrs, attrName, html.substring( start, i ) ) ) :
168 ( chr === '>' || ( empty = html.substr( i, 2 ) === '/>' ) ) &&
169 ( ( phase = 9 ) && saveAttr( attrs, attrName, html.substring( start, i ) ) );
// A backslash sets `escape` only when it is not itself escaped.
172 escape = chr === '\\' && !escape; // \\\\ is not escape for "
176 if( parseStartTag( stack, last, handler, tagName /*.toLowerCase() */, attrs, empty, i ) === false ) return false;
/**
 * Scans an end tag in `html` one character at a time (`phase` is the scanner
 * state, 9 means done) and passes the tag name it found to parseEndTag().
 * X.Dom.Parser treats the result as the index in `html` at which to resume.
 *
 * @param {Array}  stack   Open-tag stack, forwarded to parseEndTag().
 * @param {Object} handler Callback object, forwarded to parseEndTag().
 * @param {string} html    Remaining input.
 */
182 function _parseEndTag( stack, handler, html ){
189 while( i < l && phase < 9 ){
190 chr = html.charAt( i );
// Found "</": move to the next phase; the extra ++i steps over one of its two characters.
193 html.substr( i, 2 ) === '</' && ( ++phase && ++i );
195 case 1 : // wait for the start of the tag name
196 alphabets.indexOf( chr ) !== -1 && ( ++phase && ( start = i ) );
198 case 2 : // wait for the whitespace (or ">") that ends the tag name
199 whiteSpace.indexOf( chr ) !== -1 && ( ++phase );
200 ( chr === '>' ) && ( phase = 9 );
// The phase changed on this character, so the tag name ends just before it.
201 ( phase !== 2 ) && ( tagName = html.substring( start, i ) );
203 case 3 : // after the tag name: wait for the closing ">"
204 chr === '>' && ( phase = 9 );
// The tag name is passed on as written (no case normalisation).
210 parseEndTag( stack, handler, tagName ); //.toLowerCase()
/**
 * Appends one attribute record to `attrs`.
 *
 * @param {Array}  attrs Attribute list being built by _parseStartTag().
 * @param {string} name  Attribute name as written in the markup; lowercased here.
 * @param {string} value Attribute value as cut out of the markup, without the surrounding quotes.
 */
216 function saveAttr( attrs, name, value ){
217 name = name.toLowerCase();
// For attributes flagged in fillAttrs the value is replaced by the attribute name.
218 value = fillAttrs[ name ] === 1 ? name : value;
219 attrs[ attrs.length ] = {
// If the value contains a double quote: turn every `"` into `\"`, then collapse
// `\\"` back to `\"` so that quotes which were already escaped are not escaped twice.
// NOTE(review): presumably this feeds the `escaped` property read by the start handler -- confirm.
223 value.indexOf( '"' ) !== -1 ?
224 value.split( '"' ).join( '\\"' ).split( '\\\\"' ).join( '\\"' ) :
/**
 * Applies the implicit-close rules for a newly opened tag, records the tag on
 * the open-tag stack (unless it is unary) and reports it to handler.start().
 *
 * Fixes over the previous version:
 *  - `last` may be undefined (empty stack, or the stack was emptied by the
 *    inline-closing loop); the sibling lookup no longer calls toLowerCase()
 *    on it, which used to throw for e.g. a top-level <tbody>.
 *  - the "same tag opened again" check is now case-insensitive, like every
 *    other table lookup in this function.
 *
 * @param {Array}   stack   Open-tag stack (tag names as written in the markup).
 * @param {string}  last    Tag name on top of the stack, or undefined when the stack is empty.
 * @param {Object}  handler Callback object; handler.start() is called here.
 * @param {string}  tagName Name of the tag being opened, as written in the markup.
 * @param {Array}   attrs   Parsed attributes, passed through to handler.start().
 * @param {boolean} unary   True when the tag was written as self-closing ("/>").
 * @param {number}  index   Passed through to handler.start() unchanged.
 * @returns {*} Whatever handler.start() returns.
 */
function parseStartTag( stack, last, handler, tagName, attrs, unary, index ) {
    var lowerName = tagName.toLowerCase(),
        lowerLast, siblings;

    // A block element implicitly closes every inline element still open on top of the stack.
    if ( block[ lowerName ] === 1 ) {
        while ( last && inline[ last.toLowerCase() ] === 1 ) {
            parseEndTag( stack, handler, last );
            last = stack[ stack.length - 1 ];
        }
    }

    // Self-closing elements: opening the same element again, or one of its
    // sisters, closes the element currently on top of the stack.
    if ( last && closeSelf[ lowerName ] === 1 ) {
        lowerLast = last.toLowerCase();
        siblings  = sisters[ lowerName ];
        if ( lowerLast === lowerName || ( siblings && siblings[ lowerLast ] === 1 ) ) {
            parseEndTag( stack, handler, last );
        }
    }

    // Void elements are always unary, whether or not they were written with "/>".
    unary = empty[ lowerName ] === 1 || !!unary;
    if ( !unary ) {
        stack[ stack.length ] = tagName;
    }

    return handler.start( tagName, attrs, unary, index );
}
/**
 * Closes open elements: handler.end() is called for entries taken from the
 * top of `stack` downward.
 *
 * @param {Array}  stack     Open-tag stack.
 * @param {Object} handler   Callback object; handler.end() is called here.
 * @param {string} [tagName] Tag to close; X.Dom.Parser omits it for its final clean-up call.
 */
243 function parseEndTag( stack, handler, tagName ) {
244 var pos = 0, i = stack.length;
245 // If no tag name is provided, clean shop
247 // Find the closest opened tag of the same type
// pos is decremented before each comparison, so the scan runs from the top of
// the stack downward and pos reaches -1 when no entry is strictly equal to tagName.
249 for ( pos = i; 0 <= pos; )
250 if ( stack[ --pos ] === tagName )
254 // Close all the open elements, up the stack
// i is decremented before use, so entries are reported from the top of the stack downward.
256 handler.end( stack[ --i ] );
258 // Remove the open elements from the stack
/**
 * Handler object that X.Dom.parse() passes to X.Dom.Parser() to turn parser
 * callbacks into X.Dom.Node objects. It is one shared object, so its `flat`,
 * `nest` and `ignoreError` state is reused between parses.
 * `flat` receives nodes created outside any open element; `nest` holds the
 * chain of elements that are currently open.
 */
265 X.Dom._htmlStringToXNode = {
// Error callback: discards the nodes collected so far and, unless ignoreError
// is exactly true, reports the problem through X.Notification.warn().
268 err : function( html ){
269 this.flat.length = 0;
270 this.ignoreError !== true && X.Notification.warn( 'X.Dom.Parser() error ' + html );
// Start-tag callback. `attrs` entries are either bare attribute-name strings or
// objects carrying an `escaped` value; `noChild` is the unary flag passed by parseStartTag().
272 start : function( tagName, attrs, noChild, length ){
277 attr, name, i, _attrs; //, toIndex;
// Create the node as a child of nest[ l - 1 ]
// (presumably the innermost open element; `l` is assigned elsewhere -- confirm) ...
279 xnode = nest[ l - 1 ].create( tagName );
// ... or as a new entry at the end of `flat`.
281 xnode = flat[ flat.length ] = X.Dom.Node.create( tagName );
// A tag that can have children becomes the new innermost open element.
283 if( !noChild ) nest[ l ] = xnode;
284 if( i = attrs.length ){
285 //toIndex = X.Dom.Attr.toIndex;
// Attributes are visited from last to first.
288 if( attr = attrs[ --i ] ){
// A bare string is an attribute written without a value: store it as true.
289 if( typeof attr === 'string' ){
291 //i = toIndex[ name ];
292 //_attrs[ ( i || i === 0 ) ? i : name ] = name;
293 _attrs[ name ] = true;
296 //i = toIndex[ name ];
297 //_attrs[ ( i || i === 0 ) ? i : name ] = attr.escaped;
// Attribute with a value: store its escaped form.
298 _attrs[ name ] = attr.escaped;
302 xnode.attr( _attrs );
// Drop the innermost open element, if there is one.
306 0 < this.nest.length && ( --this.nest.length );
// Text callback: the text node goes under the innermost open element, or into `flat` when none is open.
308 chars : function( text ){
309 if( this.nest.length ){
310 this.nest[ this.nest.length - 1 ].createText( text );
312 this.flat[ this.flat.length ] = X.Dom.Node.createText( text );
// Comments are handed to X.emptyFunction (presumably a no-op -- confirm).
315 comment : X.emptyFunction
/**
 * Parses an HTML string with X.Dom.Parser(), using the shared
 * X.Dom._htmlStringToXNode object as the handler.
 *
 * @param {string}  html        Markup to parse.
 * @param {boolean} ignoreError Stored on the handler; when exactly true, the handler's err() does not warn.
 */
318 X.Dom.parse = function( html, ignoreError ){
319 var worker = X.Dom._htmlStringToXNode, ret;
// The handler is shared between calls, so reset its open-element chain first.
321 worker.nest.length = 0;
322 worker.ignoreError = ignoreError;
323 X.Dom.Parser( html, worker );