HTMLPurifier/Lexer/DOMLex.php Quellcode

DOMLex.php
gehe zur Dokumentation dieser Datei
1 <?php
2 
28 {
29 
/**
 * Factory that builds every token this lexer emits.
 * @type HTMLPurifier_TokenFactory
 */
private $factory;

/**
 * Runs the parent constructor, then creates the token factory used by
 * the node-to-token conversion methods of this class.
 */
public function __construct()
{
    parent::__construct();
    $this->factory = new HTMLPurifier_TokenFactory();
}
41 
/**
 * Tokenizes an HTML fragment by letting DOMDocument parse it and then
 * walking the resulting tree.
 *
 * The fragment is wrapped in a full document whose <body> contains a
 * single <div> (see wrapHTML()); the tokens returned are those of the
 * content of that wrapper. If the fragment contains a premature </div>,
 * the parser closes the wrapper early and places the remaining content
 * next to it in <body>; that remainder is tokenized as well instead of
 * being silently discarded.
 *
 * @param string $html    HTML fragment to tokenize
 * @param mixed  $config  configuration object; queried here for
 *                        'Core.AggressivelyFixLt' and passed on to
 *                        normalize() and wrapHTML()
 * @param mixed  $context passed through to normalize() and wrapHTML()
 * @return array list of tokens in document order
 */
public function tokenizeHTML($html, $config, $context)
{
    // normalize() is not defined in the visible part of this file
    $html = $this->normalize($html, $config, $context);

    // attempt to armor stray angled brackets that cannot possibly
    // form tags and thus are probably being used as emoticons
    if ($config->get('Core.AggressivelyFixLt')) {
        $char = '[^a-z!\/]';
        $comment = "/<!--(.*?)(-->|\z)/is";
        // protect entities inside comments so the undo step below can
        // tell them apart from the &lt; introduced by the armoring loop
        $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
        // repeat until stable: one pass can expose a new "<" followed by
        // a non-tag character (e.g. "<<<")
        do {
            $old = $html;
            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
        } while ($html !== $old);
        $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
    }

    // preprocess html, essential for UTF-8
    $html = $this->wrapHTML($html, $config, $context);

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

    // silence the warnings libxml raises for malformed markup
    set_error_handler(array($this, 'muteErrorHandler'));
    $doc->loadHTML($html);
    restore_error_handler();

    $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0);  // <body>
    $div = $body->getElementsByTagName('div')->item(0);   // <div>

    $tokens = array();
    $this->tokenizeDOM($div, $tokens);

    // If the wrapper <div> has a sibling, the input contained a premature
    // </div>. Remove the wrapper that was just tokenized and tokenize what
    // is left of <body>; tokenizeDOM() emits no tokens for the root node
    // itself, so only the remaining content is appended.
    if ($div->nextSibling) {
        $body->removeChild($div);
        $this->tokenizeDOM($body, $tokens);
    }

    return $tokens;
}
84 
/**
 * Walks a DOM subtree iteratively and appends the corresponding tokens
 * to $tokens.
 *
 * One FIFO queue of pending nodes is kept per depth level. The root node
 * (depth 0) is traversed but produces no start/end tokens of its own;
 * only its descendants do.
 *
 * @param DOMNode $node   root of the subtree to tokenize
 * @param array   $tokens token list, appended to by reference
 */
protected function tokenizeDOM($node, &$tokens)
{
    $depth = 0;
    $queues = array($depth => new HTMLPurifier_Queue(array($node)));
    $pendingEnds = array();

    do {
        while (!$queues[$depth]->isEmpty()) {
            $current = $queues[$depth]->shift(); // FIFO

            // element tokens are only emitted below the root level
            $emit = $depth > 0;
            if ($this->createStartNode($current, $tokens, $emit)) {
                $pendingEnds[$depth][] = $current;
            }

            $children = $current->childNodes;
            if ($children && $children->length) {
                // descend: the while condition now reads the child queue
                $depth++;
                $queues[$depth] = new HTMLPurifier_Queue();
                foreach ($children as $child) {
                    $queues[$depth]->push($child);
                }
            }
        }

        // current level exhausted: go back up and close what was opened there
        $depth--;
        if ($depth && isset($pendingEnds[$depth])) {
            while ($closing = array_pop($pendingEnds[$depth])) {
                $this->createEndNode($closing, $tokens);
            }
        }
    } while ($depth > 0);
}
121 
/**
 * Converts a single DOM node into its opening token(s).
 *
 * Text, CDATA and comment nodes always produce a token. Element nodes
 * produce an empty or start token only when $collect is true. Any other
 * node type is ignored.
 *
 * @param DOMNode $node    node to convert
 * @param array   $tokens  token list, appended to by reference
 * @param bool    $collect whether element tokens should be emitted
 * @return bool true if the node is an element with children, i.e. the
 *              caller must emit a matching end token later
 */
protected function createStartNode($node, &$tokens, $collect)
{
    // Non-element nodes must all be intercepted here. Character
    // reference nodes are not handled because those should have been
    // preprocessed.
    $type = $node->nodeType;

    if ($type === XML_TEXT_NODE) {
        $tokens[] = $this->factory->createText($node->data);
        return false;
    }

    if ($type === XML_CDATA_SECTION_NODE) {
        // undo libxml's special treatment of <script> and <style> tags
        $previous = end($tokens);
        $text = $node->data;
        // (note the token name is already normalized)
        $inRawTextElement = $previous instanceof HTMLPurifier_Token_Start
            && ($previous->name == 'script' || $previous->name == 'style');
        if ($inRawTextElement) {
            $trimmed = trim($text);
            if (substr($trimmed, 0, 4) === '<!--') {
                // strip the comment opener; a missing closer is
                // suspicious, but the content is kept as is
                $text = substr($trimmed, 4);
                if (substr($text, -3) === '-->') {
                    $text = substr($text, 0, -3);
                }
            }
        }
        // parseData() is not defined in the visible part of this file
        $tokens[] = $this->factory->createText($this->parseData($text));
        return false;
    }

    if ($type === XML_COMMENT_NODE) {
        // this code is only invoked for comments in script/style in
        // versions of libxml pre-2.6.28 (regular comments, of course,
        // are still handled regularly)
        $tokens[] = $this->factory->createComment($node->data);
        return false;
    }

    if ($type !== XML_ELEMENT_NODE) {
        // not well tested: there may be other node types to grab
        return false;
    }

    $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

    // An element only counts as empty if it really has no children
    $hasChildren = (bool) $node->childNodes->length;
    if ($collect) {
        $tokens[] = $hasChildren
            ? $this->factory->createStart($node->tagName, $attr)
            : $this->factory->createEmpty($node->tagName, $attr);
    }
    return $hasChildren;
}
186 
/**
 * Appends the end token for an element node.
 *
 * @param DOMNode $node   element whose tag name is used for the end token
 * @param array   $tokens token list, appended to by reference
 */
protected function createEndNode($node, &$tokens)
{
    $endToken = $this->factory->createEnd($node->tagName);
    $tokens[] = $endToken;
}
195 
196 
/**
 * Converts a DOM attribute map into a plain associative array.
 *
 * @param DOMNamedNodeMap $node_map attribute map of an element
 * @return array attribute name => attribute value
 */
protected function transformAttrToAssoc($node_map)
{
    // Relies on DOMNamedNodeMap being iterable and exposing ->length
    $assoc = array();
    if ($node_map->length === 0) {
        return $assoc;
    }
    foreach ($node_map as $attribute) {
        $assoc[$attribute->name] = $attribute->value;
    }
    return $assoc;
}
217 
/**
 * Error handler that deliberately does nothing.
 *
 * tokenizeHTML() installs it around DOMDocument::loadHTML() so that
 * warnings raised while parsing are discarded.
 *
 * @param int    $errno  error level (ignored)
 * @param string $errstr error message (ignored)
 */
public function muteErrorHandler($errno, $errstr)
{
    // Same result as an empty body: null rather than false, so PHP
    // treats the error as handled.
    return null;
}
226 
/**
 * preg_replace_callback() handler that restores the body of a comment:
 * '&amp;' becomes '&' and '&lt;' becomes '<'.
 *
 * @param array $matches [1] is the comment body, [2] the terminator
 *                       ('-->' or an empty string at end of input)
 * @return string the rebuilt comment
 */
public function callbackUndoCommentSubst($matches)
{
    // strtr() replaces in a single pass, so "&amp;lt;" becomes "&lt;"
    // and is not decoded a second time
    $decode = array('&amp;' => '&', '&lt;' => '<');
    $body = strtr($matches[1], $decode);
    return '<!--' . $body . $matches[2];
}
237 
/**
 * preg_replace_callback() handler that escapes every '&' in the body of
 * a comment as '&amp;'.
 *
 * @param array $matches [1] is the comment body, [2] the terminator
 *                       ('-->' or an empty string at end of input)
 * @return string the rebuilt comment
 */
public function callbackArmorCommentEntities($matches)
{
    $body = str_replace('&', '&amp;', $matches[1]);
    return '<!--' . $body . $matches[2];
}
248 
/**
 * Wraps an HTML fragment in a complete document: an optional doctype,
 * a <head> declaring UTF-8, and a <body> holding the fragment inside a
 * single <div>.
 *
 * @param string $html    fragment to wrap
 * @param mixed  $config  configuration object; its 'HTML' definition
 *                        supplies the doctype identifiers
 * @param mixed  $context not used by this method
 * @return string the complete document
 */
protected function wrapHTML($html, $config, $context)
{
    $def = $config->getDefinition('HTML');
    $hasPublicId = !empty($def->doctype->dtdPublic);
    $hasSystemId = !empty($def->doctype->dtdSystem);

    // a doctype is only emitted if at least one identifier is known
    $doctype = '';
    if ($hasPublicId || $hasSystemId) {
        $doctype = '<!DOCTYPE html ';
        if ($hasPublicId) {
            $doctype .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
        }
        if ($hasSystemId) {
            $doctype .= '"' . $def->doctype->dtdSystem . '" ';
        }
        $doctype .= '>';
    }

    $head = '<html><head>'
        . '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />'
        . '</head>';

    // This method itself does nothing about a stray </div> inside $html
    return $doctype . $head . '<body><div>' . $html . '</div></body></html>';
}
278 }
279 
280 // vim: et sw=4 sts=4




Korrekturen, Hinweise und Ergänzungen

Bitte scheuen Sie sich nicht und melden Sie, was auf dieser Seite sachlich falsch oder irreführend ist, was ergänzt werden sollte, was fehlt usw. Dazu bitte oben aus dem Menü Seite den Eintrag Support Forum wählen. Es ist eine kostenlose Anmeldung erforderlich, um Anmerkungen zu posten. Unpassende Postings, Spam usw. werden kommentarlos entfernt.