HTMLPurifier/Lexer/DirectLex.php Quellcode

DirectLex.php
gehe zur Dokumentation dieser Datei
1 <?php
2 
14 {
18  public $tracksLineNumbers = true;
19 
24  protected $_whitespace = "\x20\x09\x0D\x0A";
25 
/**
 * Callback for preg_replace_callback() that HTML-escapes the contents of
 * a matched script element while leaving its tags untouched.
 *
 * @param array $matches Capture groups: [1] the opening tag, [2] the
 *                       contents, [3] the closing tag.
 * @return string Opening tag, escaped contents and closing tag joined.
 */
protected function scriptCallback($matches)
{
    $opening_tag = $matches[1];
    // ENT_COMPAT: double quotes are escaped, single quotes are left alone.
    $escaped_body = htmlspecialchars($matches[2], ENT_COMPAT, 'UTF-8');
    $closing_tag = $matches[3];

    return $opening_tag . $escaped_body . $closing_tag;
}
35 
/**
 * Lexes an HTML string into a flat list of token objects.
 *
 * The string is scanned left to right with strpos(). Text between tags
 * becomes Text tokens; each "<...>" run becomes a Comment, End, Start or
 * Empty token. A '<' that cannot open a tag (nothing before the next '>',
 * or a non-alphabetic first character) is emitted as a literal '<' Text
 * token and the characters after it are lexed as ordinary text.
 *
 * @param string $html    Markup to lex.
 * @param object $config  Configuration; this method reads HTML.Trusted,
 *                        Core.MaintainLineNumbers, Core.CollectErrors and
 *                        Core.DirectLexLineNumberSyncInterval.
 * @param object $context Context; CurrentLine and CurrentCol are registered
 *                        for the duration of the call, and ErrorCollector
 *                        is fetched when error collection is enabled.
 * @return array Tokens in document order.
 */
public function tokenizeHTML($html, $config, $context)
{
    // special normalization for script tags without any armor
    // our "armor" heuristic is a < sign any number of whitespaces after
    // the first script tag
    if ($config->get('HTML.Trusted')) {
        $html = preg_replace_callback(
            '#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
            array($this, 'scriptCallback'),
            $html
        );
    }

    $html = $this->normalize($html, $config, $context);

    $cursor = 0; // our location in the text
    $inside_tag = false; // whether or not we're parsing the inside of a tag
    $array = array(); // result array

    // This is also treated to mean maintain *column* numbers too
    $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

    if ($maintain_line_numbers === null) {
        // automatically determine line numbering by checking
        // if error collection is on
        $maintain_line_numbers = $config->get('Core.CollectErrors');
    }

    if ($maintain_line_numbers) {
        $current_line = 1;
        $current_col = 0;
        $length = strlen($html);
    } else {
        $current_line = false;
        $current_col = false;
        $length = false;
    }
    // Both variables are handed to the context before the loop starts and
    // are updated in place below.
    $context->register('CurrentLine', $current_line);
    $context->register('CurrentCol', $current_col);
    $nl = "\n";
    // how often to manually recalculate. This will ALWAYS be right,
    // but it's pretty wasteful. Set to 0 to turn off
    $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // for testing synchronization
    $loops = 0;

    while (++$loops) {
        // $cursor is either at the start of a token, or inside of
        // a tag (i.e. there was a < immediately before it), as indicated
        // by $inside_tag

        if ($maintain_line_numbers) {
            // $rcursor, however, is always at the start of a token.
            $rcursor = $cursor - (int)$inside_tag;

            // Column number is cheap, so we calculate it every round.
            // We're interested at the *end* of the newline string, so
            // we need to add strlen($nl) == 1 to $nl_pos before subtracting it
            // from our "rcursor" position.
            $nl_pos = strrpos($html, $nl, $rcursor - $length);
            $current_col = $rcursor - (is_bool($nl_pos) ? 0 : $nl_pos + 1);

            // recalculate lines
            if ($synchronize_interval && // synchronization is on
                $cursor > 0 && // cursor is further than zero
                $loops % $synchronize_interval === 0) { // time to synchronize!
                $current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
            }
        }

        $position_next_lt = strpos($html, '<', $cursor);
        $position_next_gt = strpos($html, '>', $cursor);

        // triggers on "<b>asdf</b>" but not "asdf <b></b>"
        // special case to set up context
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // We are not inside tag and there still is another tag to parse
            $token = new HTMLPurifier_Token_Text(
                $this->parseData(
                    substr(
                        $html,
                        $cursor,
                        $position_next_lt - $cursor
                    )
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // We are not inside tag but there are no more tags
            // If we're already at the end, break
            if ($cursor === strlen($html)) {
                break;
            }
            // Create Text of rest of string
            $token = new HTMLPurifier_Token_Text(
                $this->parseData(
                    substr(
                        $html,
                        $cursor
                    )
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            $array[] = $token;
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // We are in tag and it is well formed
            // Grab the internals of the tag
            $strlen_segment = $position_next_gt - $cursor;

            if ($strlen_segment < 1) {
                // "<>": there is no tag here. Emit the '<' as literal text
                // and leave tag mode; the cursor stays on the '>' so that
                // it is picked up as ordinary text on the next pass.
                if ($e) {
                    $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                }
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            $segment = substr($html, $cursor, $strlen_segment);

            if ($segment === false) {
                // somehow, we attempted to access beyond the end of
                // the string, defense-in-depth, reported by Nate Abele
                break;
            }

            // Check if it's a comment
            if (substr($segment, 0, 3) === '!--') {
                // re-determine segment length, looking for -->
                $position_comment_end = strpos($html, '-->', $cursor);
                if ($position_comment_end === false) {
                    // uh oh, we have a comment that extends to
                    // infinity. Can't be helped: set comment
                    // end position to end of string
                    if ($e) {
                        $e->send(E_WARNING, 'Lexer: Unclosed comment');
                    }
                    $position_comment_end = strlen($html);
                    $end = true;
                } else {
                    $end = false;
                }
                $strlen_segment = $position_comment_end - $cursor;
                $segment = substr($html, $cursor, $strlen_segment);
                // Strip the leading "!--" from the comment body.
                $token = new HTMLPurifier_Token_Comment(
                    substr(
                        $segment,
                        3,
                        $strlen_segment - 3
                    )
                );
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
                }
                $array[] = $token;
                // Skip the closing "-->" unless the comment ran to the end.
                $cursor = $end ? $position_comment_end : $position_comment_end + 3;
                $inside_tag = false;
                continue;
            }

            // Check if it's an end tag
            $is_end_tag = (strpos($segment, '/') === 0);
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $token = new HTMLPurifier_Token_End($type);
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check leading character is alpha, if not, we may
            // have accidentally grabbed an emoticon. Translate into
            // text and go our merry way
            if (!ctype_alpha($segment[0])) {
                // XML: $segment[0] !== '_' && $segment[0] !== ':'
                if ($e) {
                    $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                }
                $token = new HTMLPurifier_Token_Text('<');
                if ($maintain_line_numbers) {
                    // The cursor does not move here, so the segment is
                    // lexed again as text on the next pass and its
                    // newlines are counted there; counting them now as
                    // well would advance the line number twice.
                    $token->rawPosition($current_line, $current_col);
                }
                $array[] = $token;
                $inside_tag = false;
                continue;
            }

            // Check if it is explicitly self closing, if so, remove
            // trailing slash. Remember, we could have a tag like <br>, so
            // any later token processing scripts must convert improperly
            // classified EmptyTags from StartTags.
            $is_self_closing = (strrpos($segment, '/') === $strlen_segment - 1);
            if ($is_self_closing) {
                $strlen_segment--;
                $segment = substr($segment, 0, $strlen_segment);
            }

            // Check if there are any attributes
            $position_first_space = strcspn($segment, $this->_whitespace);

            if ($position_first_space >= $strlen_segment) {
                // No whitespace in the segment: it is just the tag name.
                if ($is_self_closing) {
                    $token = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $token = new HTMLPurifier_Token_Start($segment);
                }
                if ($maintain_line_numbers) {
                    $token->rawPosition($current_line, $current_col);
                    $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
                }
                $array[] = $token;
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Grab out all the data
            $type = substr($segment, 0, $position_first_space);
            $attribute_string =
                trim(
                    substr(
                        $segment,
                        $position_first_space
                    )
                );
            if ($attribute_string) {
                $attr = $this->parseAttributeString(
                    $attribute_string,
                    $config,
                    $context
                );
            } else {
                $attr = array();
            }

            if ($is_self_closing) {
                $token = new HTMLPurifier_Token_Empty($type, $attr);
            } else {
                $token = new HTMLPurifier_Token_Start($type, $attr);
            }
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
                $current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
            }
            $array[] = $token;
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // inside tag, but there's no ending > sign
            if ($e) {
                $e->send(E_WARNING, 'Lexer: Missing gt');
            }
            // Treat the dangling '<' and everything after it as text.
            $token = new HTMLPurifier_Token_Text(
                '<' .
                $this->parseData(
                    substr($html, $cursor)
                )
            );
            if ($maintain_line_numbers) {
                $token->rawPosition($current_line, $current_col);
            }
            // no cursor scroll? Hmm...
            $array[] = $token;
            break;
        }
        // Not reached: every branch above ends in continue or break.
        break;
    }

    $context->destroy('CurrentLine');
    $context->destroy('CurrentCol');
    return $array;
}
345 
/**
 * Counts occurrences of a needle within a window of the haystack.
 *
 * On PHP 5.1 and later this forwards to the four-argument form of
 * substr_count(). On older versions the window is cut out with substr()
 * first and the two-argument form is used.
 *
 * @param string $haystack String to search in.
 * @param string $needle   Substring to count.
 * @param int    $offset   Start of the window within $haystack.
 * @param int    $length   Length of the window.
 * @return int Number of occurrences inside the window.
 */
protected function substrCount($haystack, $needle, $offset, $length)
{
    // Result of the version check, computed once and reused across calls.
    static $is_pre_51;
    if (!isset($is_pre_51)) {
        $is_pre_51 = version_compare(PHP_VERSION, '5.1', '<');
    }

    if (!$is_pre_51) {
        return substr_count($haystack, $needle, $offset, $length);
    }

    $window = substr($haystack, $offset, $length);
    return substr_count($window, $needle);
}
367 
/**
 * Parses the attribute portion of a tag into an associative array.
 *
 * Supports key="value", key='value', unquoted key=value and valueless
 * (boolean) attributes. Attribute values are passed through
 * $this->parseData() before being stored.
 *
 * @param string $string  Attribute text: everything inside the tag after
 *                        the tag name.
 * @param object $config  Configuration; only Core.CollectErrors is read.
 * @param object $context Context; ErrorCollector is fetched from it when
 *                        error collection is enabled.
 * @return array Attribute name => value. A boolean attribute maps to its
 *               own name.
 * @throws Exception If the scanning loop fails to advance.
 */
public function parseAttributeString($string, $config, $context)
{
    $string = (string)$string; // quick typecast

    if ($string === '') {
        return array();
    } // no attributes

    $e = false;
    if ($config->get('Core.CollectErrors')) {
        $e =& $context->get('ErrorCollector');
    }

    // let's see if we can abort as quickly as possible
    // one equal sign, no whitespace => one attribute
    $num_equal = substr_count($string, '=');
    // Test against every separator the main loop honours (space, tab, CR,
    // LF), not just 0x20, and do not mistake a match at offset 0 for
    // "no whitespace".
    $has_space = strcspn($string, $this->_whitespace) < strlen($string);
    if ($num_equal === 0 && !$has_space) {
        // bool attribute
        return array($string => $string);
    } elseif ($num_equal === 1 && !$has_space) {
        // only one attribute
        list($key, $quoted_value) = explode('=', $string);
        $quoted_value = trim($quoted_value);
        if ($key === '') {
            if ($e) {
                $e->send(E_ERROR, 'Lexer: Missing attribute key');
            }
            return array();
        }
        // Compare against '' explicitly: a value of "0" (border=0) is
        // falsy in PHP but is a perfectly good attribute value.
        if ($quoted_value === '') {
            return array($key => '');
        }
        // $quoted_value is non-empty here, so both offsets exist.
        $first_char = $quoted_value[0];
        $last_char = $quoted_value[strlen($quoted_value) - 1];

        $same_quote = ($first_char === $last_char);
        $open_quote = ($first_char === '"' || $first_char === "'");

        if ($same_quote && $open_quote) {
            // well behaved
            $value = substr($quoted_value, 1, strlen($quoted_value) - 2);
        } else {
            // not well behaved
            if ($open_quote) {
                if ($e) {
                    $e->send(E_ERROR, 'Lexer: Missing end quote');
                }
                $value = substr($quoted_value, 1);
            } else {
                $value = $quoted_value;
            }
        }
        // substr() can return false on an empty result on older PHP.
        if ($value === false) {
            $value = '';
        }
        return array($key => $this->parseData($value));
    }

    // setup loop environment
    $array = array(); // return assoc array of attributes
    $cursor = 0; // current position in string (moves forward)
    $size = strlen($string); // size of the string (stays the same)

    // if we have unquoted attributes, the parser expects a terminating
    // space, so let's guarantee that there's always a terminating space.
    $string .= ' ';

    $old_cursor = -1;
    while ($cursor < $size) {
        if ($old_cursor >= $cursor) {
            throw new Exception("Infinite loop detected");
        }
        $old_cursor = $cursor;

        // skip leading whitespace
        $cursor += strspn($string, $this->_whitespace, $cursor);

        if ($cursor >= $size) {
            // Only whitespace was left. Stop here instead of reporting a
            // bogus missing key and scanning past the end of the string.
            break;
        }

        // grab the key
        $key_begin = $cursor; //we're currently at the start of the key

        // scroll past all characters that are the key (not whitespace or =)
        $cursor += strcspn($string, $this->_whitespace . '=', $cursor);

        $key_end = $cursor; // now at the end of the key

        $key = substr($string, $key_begin, $key_end - $key_begin);

        // Explicit empty check so that an attribute named "0" is kept.
        if ($key === false || $key === '') {
            if ($e) {
                $e->send(E_ERROR, 'Lexer: Missing attribute key');
            }
            $cursor += 1 + strcspn($string, $this->_whitespace, $cursor + 1); // prevent infinite loop
            continue; // empty key
        }

        // scroll past all whitespace
        $cursor += strspn($string, $this->_whitespace, $cursor);

        if ($cursor >= $size) {
            $array[$key] = $key;
            break;
        }

        // if the next character is an equal sign, we've got a regular
        // pair, otherwise, it's a bool attribute
        $first_char = isset($string[$cursor]) ? $string[$cursor] : '';

        if ($first_char === '=') {
            // key="value"

            $cursor++;
            $cursor += strspn($string, $this->_whitespace, $cursor);

            // we might be in front of a quote right now
            // (an out-of-range offset reads as '' and takes the unquoted path)
            $char = isset($string[$cursor]) ? $string[$cursor] : '';

            if ($char === '"' || $char === "'") {
                // it's quoted, end bound is $char
                $cursor++;
                $value_begin = $cursor;
                $cursor = strpos($string, $char, $cursor);
                $value_end = $cursor;
            } else {
                // it's not quoted, end bound is whitespace
                $value_begin = $cursor;
                $cursor += strcspn($string, $this->_whitespace, $cursor);
                $value_end = $cursor;
            }

            // we reached a premature end (no closing quote was found)
            if ($cursor === false) {
                $cursor = $size;
                $value_end = $cursor;
            }

            $value = substr($string, $value_begin, $value_end - $value_begin);
            if ($value === false) {
                $value = '';
            }
            $array[$key] = $this->parseData($value);
            $cursor++;
        } else {
            // boolattr; $key is known to be non-empty at this point
            $array[$key] = $key;
        }
    }
    return $array;
}
537 }
538 
539 // vim: et sw=4 sts=4




Korrekturen, Hinweise und Ergänzungen

Bitte scheuen Sie sich nicht und melden Sie, was auf dieser Seite sachlich falsch oder irreführend ist, was ergänzt werden sollte, was fehlt usw. Dazu bitte oben aus dem Menü Seite den Eintrag Support Forum wählen. Es ist eine kostenlose Anmeldung erforderlich, um Anmerkungen zu posten. Unpassende Postings, Spam usw. werden kommentarlos entfernt.