HTMLPurifier/Lexer.php Quellcode

Lexer.php
gehe zur Dokumentation dieser Datei
1 <?php
2 
43 {
44 
/**
 * Whether this lexer implementation keeps track of line numbers.
 * create() refuses to hand out a lexer whose flag is false when
 * Core.MaintainLineNumbers or Core.CollectErrors is enabled.
 */
49  public $tracksLineNumbers = false;
50 
51  // -- STATIC ----------------------------------------------------------
52 
/**
 * Builds the lexer instance that matches a configuration.
 *
 * The implementation is read from %Core.LexerImpl. An object is returned
 * as-is; the names 'DOMLex', 'DirectLex' and 'PH5P' are instantiated; null
 * triggers auto-detection (DirectLex when line tracking is required,
 * otherwise DOMLex if the DOM extension is usable, else DirectLex).
 *
 * @param HTMLPurifier_Config|object|string|null $config Configuration
 *        object. Passing a lexer prototype (object or name) instead is
 *        deprecated and raises E_USER_WARNING; in that case no directives
 *        are available, so line tracking is treated as not requested.
 * @return object The lexer instance.
 * @throws HTMLPurifier_Exception If the lexer name is not recognized, if no
 *         lexer could be instantiated, or if line tracking is required but
 *         the chosen lexer does not support it.
 */
public static function create($config)
{
    if ($config instanceof HTMLPurifier_Config) {
        $lexer = $config->get('Core.LexerImpl');
        $needs_tracking =
            $config->get('Core.MaintainLineNumbers') ||
            $config->get('Core.CollectErrors');
    } else {
        $lexer = $config;
        trigger_error(
            "Passing a prototype to
            HTMLPurifier_Lexer::create() is deprecated, please instead
            use %Core.LexerImpl",
            E_USER_WARNING
        );
        // A prototype is not a configuration object: it has no directives
        // to query, and calling ->get() on it would be a fatal error.
        $needs_tracking = false;
    }

    $inst = null;
    if (is_object($lexer)) {
        $inst = $lexer;
    } else {
        if (is_null($lexer)) {
            // auto-detection algorithm
            if ($needs_tracking) {
                $lexer = 'DirectLex';
            } elseif (class_exists('DOMDocument') &&
                method_exists('DOMDocument', 'loadHTML') &&
                !extension_loaded('domxml')
            ) {
                // check for DOM support, because while it's part of the
                // core, it can be disabled compile time. Also, the PECL
                // domxml extension overrides the default DOM, and is not
                // supported here
                $lexer = 'DOMLex';
            } else {
                $lexer = 'DirectLex';
            }
        }

        // instantiate recognized string names
        switch ($lexer) {
            case 'DOMLex':
                $inst = new HTMLPurifier_Lexer_DOMLex();
                break;
            case 'DirectLex':
                $inst = new HTMLPurifier_Lexer_DirectLex();
                break;
            case 'PH5P':
                $inst = new HTMLPurifier_Lexer_PH5P();
                break;
            default:
                throw new HTMLPurifier_Exception(
                    "Cannot instantiate unrecognized Lexer type " .
                    htmlspecialchars($lexer)
                );
        }
    }

    if (!$inst) {
        throw new HTMLPurifier_Exception('No lexer was instantiated');
    }

    // once PHP DOM implements native line numbers, or we
    // hack out something using XSLT, remove this stipulation
    if ($needs_tracking && !$inst->tracksLineNumbers) {
        throw new HTMLPurifier_Exception(
            'Cannot use lexer that does not support line numbers with ' .
            'Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)'
        );
    }

    return $inst;
}
149 
150  // -- CONVENIENCE MEMBERS ---------------------------------------------
151 
/**
 * Sets up the entity parser that parseData() and normalize() delegate to.
 */
public function __construct()
{
    $parser = new HTMLPurifier_EntityParser();
    $this->_entity_parser = $parser;
}
156 
162  array(
163  '&quot;' => '"',
164  '&amp;' => '&',
165  '&lt;' => '<',
166  '&gt;' => '>',
167  '&#39;' => "'",
168  '&#039;' => "'",
169  '&#x27;' => "'"
170  );
171 
/**
 * Decodes entities in a piece of character data.
 *
 * Works in three stages so the common cases stay cheap: bail out when no
 * ampersand can start an entity, translate the entries of
 * $this->_special_entity2str with strtr(), and only hand the string to the
 * entity parser if ampersands other than decoded "&amp;" remain.
 *
 * @param string $string Text that may contain entities.
 * @return string The text with entities substituted.
 */
public function parseData($string)
{
    // the index access below needs at least one character
    if ($string === '') {
        return '';
    }

    // ampersands followed by a space, or sitting at the very end of the
    // string, cannot introduce an entity, so they are not counted
    $last = $string[strlen($string) - 1];
    $candidates = substr_count($string, '&') - substr_count($string, '& ');
    $candidates -= ($last === '&' ? 1 : 0);
    if (!$candidates) {
        return $string;
    }

    // every "&amp;" leaves one literal ampersand behind after translation
    $escaped_amps = substr_count($string, '&amp;');
    $string = strtr($string, $this->_special_entity2str);

    // recount on the translated string (same formula as above)
    $last = $string[strlen($string) - 1];
    $remaining = substr_count($string, '&') - substr_count($string, '& ');
    $remaining -= ($last === '&' ? 1 : 0);
    if ($remaining <= $escaped_amps) {
        return $string;
    }

    // something less common is still in there: let the entity parser do it
    return $this->_entity_parser->substituteSpecialEntities($string);
}
215 
/**
 * Base-class stub for the tokenizer: it does no lexing and only raises
 * an E_USER_ERROR. Presumably concrete lexers override it (not visible
 * from this class alone).
 *
 * @param string $string  Unused here.
 * @param mixed  $config  Unused here.
 * @param mixed  $context Unused here.
 */
public function tokenizeHTML($string, $config, $context)
{
    $message = 'Call to abstract class';
    trigger_error($message, E_USER_ERROR);
}
227 
/**
 * Replaces every non-empty <![CDATA[ ... ]]> section with its contents,
 * HTML-escaped by CDATACallback().
 *
 * @param string $string HTML to process.
 * @return string HTML with CDATA sections turned into escaped text.
 */
protected static function escapeCDATA($string)
{
    // /s lets the section span several lines; the lazy quantifier stops
    // at the first closing ]]>
    $pattern = '/<!\[CDATA\[(.+?)\]\]>/s';
    $callback = array('HTMLPurifier_Lexer', 'CDATACallback');

    return preg_replace_callback($pattern, $callback, $string);
}
241 
/**
 * Handles the comment-wrapped CDATA idiom
 * <!--//--><![CDATA[//><!-- ... //--><!]]> by replacing the whole
 * construct with its contents, HTML-escaped by CDATACallback().
 *
 * @param string $string HTML to process.
 * @return string HTML with the wrapped sections turned into escaped text.
 */
protected static function escapeCommentedCDATA($string)
{
    $pattern = '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s';
    $callback = array('HTMLPurifier_Lexer', 'CDATACallback');

    return preg_replace_callback($pattern, $callback, $string);
}
255 
/**
 * Strips Internet Explorer conditional comments, i.e. everything from
 * <!--[if ...]> up to and including the next <![endif]-->.
 *
 * @param string $string HTML to process.
 * @return string HTML without the conditional blocks.
 */
protected static function removeIEConditional($string)
{
    // case-insensitive, and the dot also matches newlines
    // (probably should generalize for all strings)
    $pattern = '#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si';

    return preg_replace($pattern, '', $string);
}
269 
/**
 * preg_replace_callback() handler used by escapeCDATA() and
 * escapeCommentedCDATA(): HTML-escapes the first capture group.
 *
 * @param array $matches Regex match; index 1 holds the section contents.
 * @return string The contents escaped with ENT_COMPAT as UTF-8.
 */
protected static function CDATACallback($matches)
{
    $contents = $matches[1];

    return htmlspecialchars($contents, ENT_COMPAT, 'UTF-8');
}
284 
/**
 * Runs the pre-tokenization cleanup passes over raw HTML, in this order:
 * newline normalization, CDATA escaping, IE conditional removal, optional
 * body extraction, expansion of non-special entities, UTF-8 cleaning and
 * optional removal of processing instructions.
 *
 * @param string $html    Raw HTML.
 * @param object $config  Configuration; read through get().
 * @param object $context Context; only queried for 'ErrorCollector'.
 * @return string The normalized HTML.
 */
public function normalize($html, $config, $context)
{
    // turn CRLF and lone CR into LF
    if ($config->get('Core.NormalizeNewlines')) {
        $html = str_replace(array("\r\n", "\r"), "\n", $html);
    }

    // the comment-wrapped CDATA variant is only handled for trusted HTML
    if ($config->get('HTML.Trusted')) {
        $html = $this->escapeCommentedCDATA($html);
    }

    // plain CDATA sections, then IE conditional comments
    $html = $this->escapeCDATA($html);
    $html = $this->removeIEConditional($html);

    // reduce a full document to the contents of its body, if requested
    if ($config->get('Core.ConvertDocumentToFragment')) {
        $collector = false;
        if ($config->get('Core.CollectErrors')) {
            $collector =& $context->get('ErrorCollector');
        }
        $fragment = $this->extractBody($html);
        if ($collector && $fragment != $html) {
            $collector->send(E_WARNING, 'Lexer: Extracted body');
        }
        $html = $fragment;
    }

    // expand every entity except the special ones
    $html = $this->_entity_parser->substituteNonSpecialEntities($html);

    // cleaning has to follow entity expansion, because an entity can
    // stand for a character that is not allowed in an SGML context
    $html = HTMLPurifier_Encoder::cleanUTF8($html);

    // drop processing instructions if they are to be removed
    if ($config->get('Core.RemoveProcessingInstructions')) {
        $html = preg_replace('#<\?.+?\?>#s', '', $html);
    }

    return $html;
}
340 
/**
 * Returns the contents of the document's <body> element, or the input
 * unchanged when there is no usable body.
 *
 * A <body>...</body> pair that sits inside an unterminated HTML comment is
 * not treated as the document body; otherwise commented-out markup would
 * replace the real content.
 *
 * @param string $html HTML document or fragment.
 * @return string The body contents, or $html itself.
 */
public function extractBody($html)
{
    $matches = array();
    $found = preg_match(
        '!<body[^>]*>(.*)</body>!is',
        $html,
        $matches,
        PREG_OFFSET_CAPTURE
    );
    if (!$found) {
        return $html;
    }

    // Look at the text before the match: if the last comment opener there
    // has no closer after it, the <body> tag is inside a comment.
    $prefix = substr($html, 0, $matches[0][1]);
    $comment_start = strrpos($prefix, '<!--');
    $comment_end = strrpos($prefix, '-->');
    if ($comment_start !== false &&
        ($comment_end === false || $comment_end <= $comment_start)
    ) {
        return $html;
    }

    return $matches[1][0];
}
355 }
356 
357 // vim: et sw=4 sts=4




Korrekturen, Hinweise und Ergänzungen

Bitte scheuen Sie sich nicht und melden Sie, was auf dieser Seite sachlich falsch oder irreführend ist, was ergänzt werden sollte, was fehlt usw. Dazu bitte oben aus dem Menü Seite den Eintrag Support Forum wählen. Es ist eine kostenlose Anmeldung erforderlich, um Anmerkungen zu posten. Unpassende Postings, Spam usw. werden kommentarlos entfernt.