<?php

/**
 * Lexer that tokenizes HTML by loading it into a DOMDocument and walking
 * the resulting node tree.
 *
 * The input fragment is wrapped in a full document (see wrapHTML()) before
 * being handed to DOMDocument::loadHTML(); the children of the wrapper
 * <div> are then converted to tokens by tokenizeDOM().
 */
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
{

    /**
     * Token factory used to build every token this lexer emits.
     */
    private $factory;

    public function __construct() {
        // setup the factory
        parent::__construct();
        $this->factory = new HTMLPurifier_TokenFactory();
    }

    /**
     * Converts an HTML fragment into a flat list of tokens.
     *
     * @param $html    HTML fragment to tokenize
     * @param $config  Configuration object; 'Core.AggressivelyFixLt' and the
     *                 'HTML' definition are read from it
     * @param $context Context object, passed through to normalize()/wrapHTML()
     * @return Array of tokens; empty if the wrapper element could not be
     *         located in the parsed document
     */
    public function tokenizeHTML($html, $config, $context) {

        $html = $this->normalize($html, $config, $context);

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            // protect entities inside comments so the undo step below can
            // tell them apart from the &lt; we are about to introduce
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
            // repeat until stable: one pass can expose a new "<" followed
            // by a non-tag character (e.g. "<<<")
            do {
                $old = $html;
                $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
            } while ($html !== $old);
            $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
        }

        // preprocess html, essential for UTF-8
        $html = $this->wrapHTML($html, $config, $context);

        $doc = new DOMDocument();
        $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

        // silence libxml's parse warnings for the duration of the load
        set_error_handler(array($this, 'muteErrorHandler'));
        $doc->loadHTML($html);
        restore_error_handler();

        $tokens = array();

        // Locate the wrapper elements one step at a time so that a missing
        // element yields an empty token list instead of a fatal
        // "member function on a non-object" error.
        $root = $doc->getElementsByTagName('html')->item(0);                 // <html>
        $body = $root ? $root->getElementsByTagName('body')->item(0) : null; // <body>
        $div  = $body ? $body->getElementsByTagName('div')->item(0) : null;  // <div>
        if (!$div) {
            return $tokens;
        }

        $this->tokenizeDOM($div, $tokens);

        // If the wrapper div has a sibling, the input contained a premature
        // </div> that closed the wrapper early. Remove the div we already
        // tokenized and tokenize whatever is left in <body>, so the
        // trailing content is not silently discarded.
        if ($div->nextSibling) {
            $body->removeChild($div);
            $this->tokenizeDOM($body, $tokens);
        }

        return $tokens;
    }

    /**
     * Recursively converts a DOM node and its descendants into tokens.
     *
     * @param $node    DOMNode to be tokenized
     * @param $tokens  Array-list of already tokenized tokens; new tokens are
     *                 appended to it (passed by reference)
     * @param $collect Whether to emit start/end (or empty) tokens for $node
     *                 itself; false on the outermost call so the wrapper
     *                 element is not included in the output
     * @return void; results are accumulated in $tokens
     */
    protected function tokenizeDOM($node, &$tokens, $collect = false) {

        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
            return;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
            $data = $node->data;
            // (note $node->tagname is already normalized)
            if ($last instanceof HTMLPurifier_Token_Start && ($last->name == 'script' || $last->name == 'style')) {
                $new_data = trim($data);
                if (substr($new_data, 0, 4) === '<!--') {
                    // strip the comment wrapper around the script/style body
                    $data = substr($new_data, 4);
                    if (substr($data, -3) === '-->') {
                        $data = substr($data, 0, -3);
                    } else {
                        // Highly suspicious! Not sure what to do...
                    }
                }
            }
            $tokens[] = $this->factory->createText($this->parseData($data));
            return;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this code is only invoked for comments in script/style in
            // versions of libxml pre-2.6.28 (regular comments, of course,
            // are still handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
            return;
        } elseif (
            // not-well tested: there may be other nodes we have to grab
            $node->nodeType !== XML_ELEMENT_NODE
        ) {
            return;
        }

        $attr = $node->hasAttributes() ?
            $this->transformAttrToAssoc($node->attributes) :
            array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
        } else {
            if ($collect) { // don't wrap on first iteration
                $tokens[] = $this->factory->createStart($node->tagName, $attr);
            }
            // Use a separate loop variable: reusing $node here would
            // overwrite the parameter and lose the tag name needed for
            // the end token below.
            foreach ($node->childNodes as $child) {
                // remember, it's an accumulator. Otherwise, we'd have
                // to use array_merge
                $this->tokenizeDOM($child, $tokens, true);
            }
            if ($collect) {
                $tokens[] = $this->factory->createEnd($node->tagName);
            }
        }

    }

    /**
     * Converts a DOMNamedNodeMap of attributes into an associative array.
     *
     * @param $node_map DOMNamedNodeMap of attribute nodes
     * @return Associative array mapping attribute name to attribute value
     */
    protected function transformAttrToAssoc($node_map) {
        // This relies on the node map being iterable with foreach and on
        // it exposing a ->length attribute.
        if ($node_map->length === 0) return array();
        $array = array();
        foreach ($node_map as $attr) {
            $array[$attr->name] = $attr->value;
        }
        return $array;
    }

    /**
     * An error handler that mutes all errors; installed around loadHTML().
     */
    public function muteErrorHandler($errno, $errstr) {}

    /**
     * Callback function for undoing the escaping of stray angled brackets
     * (and the entity armoring) inside comments.
     *
     * @param $matches Regex match array: [1] comment body, [2] terminator
     * @return Comment text with '&amp;' and '&lt;' turned back into '&'/'<'
     */
    public function callbackUndoCommentSubst($matches) {
        return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
    }

    /**
     * Callback function that entity-izes ampersands in comments so that
     * callbackUndoCommentSubst doesn't clobber them.
     *
     * @param $matches Regex match array: [1] comment body, [2] terminator
     * @return Comment text with every '&' replaced by '&amp;'
     */
    public function callbackArmorCommentEntities($matches) {
        return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
    }

    /**
     * Wraps an HTML fragment in the necessary HTML document scaffolding.
     *
     * @param $html    HTML fragment to wrap
     * @param $config  Configuration object; its 'HTML' definition supplies
     *                 the doctype identifiers
     * @param $context Context object (unused here)
     * @return Complete HTML document string containing the fragment inside
     *         <body><div>...</div></body>
     */
    protected function wrapHTML($html, $config, $context) {
        $def = $config->getDefinition('HTML');
        $ret = '';

        if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
            $ret .= '<!DOCTYPE html ';
            if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
            if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
            $ret .= '>';
        }

        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // A stray </div> inside $html closes the wrapper early; that case
        // is recovered in tokenizeHTML() rather than prevented here.
        $ret .= '</head><body><div>'.$html.'</div></body></html>';
        return $ret;
    }

}

// vim: et sw=4 sts=4
// Copyright © 2003 - 2009 MyOOS [Shopsystem]. All rights reserved. MyOOS [Shopsystem] is Free Software released under the GNU/GPL License. Webmaster: info@r23.de (Impressum)
