HTMLPurifier/Encoder.php Quellcode

Encoder.php
gehe zur Dokumentation dieser Datei
1 <?php
2 
8 {
9 
/**
 * Guards against instantiation.
 *
 * The constructor is private and raises an E_USER_ERROR, since every
 * other method of this class is static.
 */
private function __construct()
{
    $message = 'Cannot instantiate encoder, call methods statically';
    trigger_error($message, E_USER_ERROR);
}
17 
/**
 * Error handler that swallows whatever it is given.
 *
 * unsafeIconv() installs it around its iconv() call so that errors
 * raised by the conversion are not reported.
 *
 * @return void
 */
public static function muteErrorHandler()
{
    // Intentionally empty: the error is neither reported nor rethrown.
}
24 
/**
 * Runs iconv() with error reporting muted.
 *
 * The mute handler is installed only for the duration of the call and
 * the previous handler is restored afterwards. None of the iconv bug
 * work-arounds are applied here.
 *
 * @param string $in   Input character set.
 * @param string $out  Output character set (passed to iconv() unchanged).
 * @param string $text Text to convert.
 * @return string|false Whatever iconv() returned.
 */
public static function unsafeIconv($in, $out, $text)
{
    $silencer = array('HTMLPurifier_Encoder', 'muteErrorHandler');
    set_error_handler($silencer);
    $converted = iconv($in, $out, $text);
    restore_error_handler();

    return $converted;
}
39 
/**
 * iconv() wrapper that works around the output-truncation bug.
 *
 * Behaviour depends on the result of testIconvTruncateBug():
 *  - ICONV_OK: the text is converted in one call.
 *  - ICONV_TRUNCATES: only 'utf-8' input (exact, lower-case spelling) is
 *    supported; the text is converted in chunks of at most
 *    $max_chunk_size bytes, never splitting a multi-byte character.
 *  - anything else: false is returned.
 *
 * @param string $in             Input character set.
 * @param string $out            Output character set.
 * @param string $text           Text to convert.
 * @param int    $max_chunk_size Upper bound, in bytes, of each chunk;
 *                               must be at least 4.
 * @return string|false Converted text, or false when conversion is not
 *                      possible (unsupported input charset, too small a
 *                      chunk size, or four continuation bytes in a row
 *                      at a chunk boundary).
 */
public static function iconv($in, $out, $text, $max_chunk_size = 8000)
{
    $code = self::testIconvTruncateBug();
    if ($code == self::ICONV_OK) {
        return self::unsafeIconv($in, $out, $text);
    }
    if ($code != self::ICONV_TRUNCATES) {
        return false;
    }
    // The truncation bug can only be worked around for UTF-8 input.
    if ($in != 'utf-8') {
        return false;
    }
    if ($max_chunk_size < 4) {
        trigger_error('max_chunk_size is too small', E_USER_WARNING);
        return false;
    }

    $length = strlen($text);
    if ($length <= $max_chunk_size) {
        return self::unsafeIconv($in, $out, $text);
    }

    $result = '';
    $offset = 0;
    while ($offset + $max_chunk_size < $length) {
        // The byte right after the chunk must not be a continuation byte
        // (10xxxxxx); shrink the chunk by up to three bytes until it isn't.
        $chunk_size = null;
        for ($back = 0; $back <= 3; $back++) {
            $boundary = ord($text[$offset + $max_chunk_size - $back]);
            if (0x80 != (0xC0 & $boundary)) {
                $chunk_size = $max_chunk_size - $back;
                break;
            }
        }
        if ($chunk_size === null) {
            return false; // rather confusing UTF-8...
        }
        $chunk = substr($text, $offset, $chunk_size);
        $result .= self::unsafeIconv($in, $out, $chunk);
        $offset += $chunk_size;
    }
    // Whatever is left fits into a single chunk.
    $result .= self::unsafeIconv($in, $out, substr($text, $offset));

    return $result;
}
97 
/**
 * Removes malformed UTF-8 sequences and disallowed code points.
 *
 * Allowed code points are U+0009, U+000A, U+000D, U+0020-U+007E,
 * U+00A0-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF (the XML Char
 * production minus the non-SGML range U+007F-U+009F).
 *
 * A string that already consists solely of allowed code points is
 * returned untouched by the regular-expression fast path. Otherwise the
 * string is walked byte by byte; in that slow path the byte order mark
 * U+FEFF is additionally dropped, and a byte that interrupts an
 * unfinished multi-byte sequence is discarded together with it.
 *
 * @param string $str       String to clean.
 * @param bool   $force_php Accepted for compatibility; not consulted by
 *                          this implementation.
 * @return string The cleaned string.
 */
public static function cleanUTF8($str, $force_php = false)
{
    // UTF-8 validity is checked since PHP 4.3.5
    // This is an optimization: if the string is already valid UTF-8, no
    // need to do PHP stuff. 99% of the time, this will be the case.
    // The regexp matches the XML char production, as well as excluding
    // non-SGML codepoints U+007F to U+009F
    if (preg_match(
        '/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du',
        $str
    )) {
        return $str;
    }

    $mState = 0; // cached expected number of octets after the current octet
                 // until the beginning of the next UTF8 character sequence
    $mUcs4 = 0;  // cached Unicode character
    $mBytes = 1; // cached expected number of octets in the current sequence

    // $char accumulates the raw bytes of the character being decoded while
    // $mUcs4 accumulates its code point; once a character is complete and
    // allowed, its raw bytes are appended to $out.
    $out = '';
    $char = '';

    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        // Square-bracket offsets: the curly-brace form is a parse error
        // from PHP 8.0 on.
        $in = ord($str[$i]);
        $char .= $str[$i]; // append byte to char
        if (0 == $mState) {
            // When mState is zero we expect either a US-ASCII character
            // or a multi-octet sequence.
            if (0 == (0x80 & ($in))) {
                // US-ASCII, pass straight through.
                if (($in <= 31 || $in == 127) &&
                    !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
                ) {
                    // control characters, remove
                } else {
                    $out .= $char;
                }
                // reset
                $char = '';
                $mBytes = 1;
            } elseif (0xC0 == (0xE0 & ($in))) {
                // First octet of 2 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x1F) << 6;
                $mState = 1;
                $mBytes = 2;
            } elseif (0xE0 == (0xF0 & ($in))) {
                // First octet of 3 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x0F) << 12;
                $mState = 2;
                $mBytes = 3;
            } elseif (0xF0 == (0xF8 & ($in))) {
                // First octet of 4 octet sequence
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x07) << 18;
                $mState = 3;
                $mBytes = 4;
            } elseif (0xF8 == (0xFC & ($in))) {
                // First octet of 5 octet sequence.
                //
                // This is illegal because the encoded codepoint must be
                // either:
                // (a) not the shortest form or
                // (b) outside the Unicode range of 0-0x10FFFF.
                // Rather than trying to resynchronize, we will carry on
                // until the end of the sequence and let the later error
                // handling code catch it.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 0x03) << 24;
                $mState = 4;
                $mBytes = 5;
            } elseif (0xFC == (0xFE & ($in))) {
                // First octet of 6 octet sequence, see comments for 5
                // octet sequence.
                $mUcs4 = ($in);
                $mUcs4 = ($mUcs4 & 1) << 30;
                $mState = 5;
                $mBytes = 6;
            } else {
                // Current octet is neither in the US-ASCII range nor a
                // legal first octet of a multi-octet sequence.
                $mState = 0;
                $mUcs4 = 0;
                $mBytes = 1;
                $char = '';
            }
        } else {
            // When mState is non-zero, we expect a continuation of the
            // multi-octet sequence
            if (0x80 == (0xC0 & ($in))) {
                // Legal continuation.
                $shift = ($mState - 1) * 6;
                $tmp = $in;
                $tmp = ($tmp & 0x0000003F) << $shift;
                $mUcs4 |= $tmp;

                if (0 == --$mState) {
                    // End of the multi-octet sequence. mUcs4 now contains
                    // the final Unicode codepoint to be output

                    // Check for illegal sequences and codepoints.

                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
                        ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
                        ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
                        (4 < $mBytes) ||
                        // From Unicode 3.2, surrogate characters = illegal
                        (($mUcs4 & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($mUcs4 > 0x10FFFF)
                    ) {
                        // illegal sequence: drop it
                    } elseif (0xFEFF != $mUcs4 && // omit BOM
                        // check for valid Char unicode codepoints
                        (
                            0x9 == $mUcs4 ||
                            0xA == $mUcs4 ||
                            0xD == $mUcs4 ||
                            (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
                            // 7F-9F is not strictly prohibited by XML,
                            // but it is non-SGML, and thus we don't allow it
                            (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
                            // Same range the fast-path regexp accepts;
                            // without it, valid characters such as U+FF01
                            // or U+FFFD would be stripped here.
                            (0xE000 <= $mUcs4 && 0xFFFD >= $mUcs4) ||
                            (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
                        )
                    ) {
                        $out .= $char;
                    }
                    // initialize UTF8 cache (reset)
                    $mState = 0;
                    $mUcs4 = 0;
                    $mBytes = 1;
                    $char = '';
                }
            } else {
                // ((0xC0 & (*in) != 0x80) && (mState != 0))
                // Incomplete multi-octet sequence.
                // used to result in complete fail, but we'll reset
                $mState = 0;
                $mUcs4 = 0;
                $mBytes = 1;
                $char = '';
            }
        }
    }
    return $out;
}
282 
296  // +----------+----------+----------+----------+
297  // | 33222222 | 22221111 | 111111 | |
298  // | 10987654 | 32109876 | 54321098 | 76543210 | bit
299  // +----------+----------+----------+----------+
300  // | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
301  // | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
302  // | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
303  // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
304  // +----------+----------+----------+----------+
305  // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
306  // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
307  // +----------+----------+----------+----------+
308 
/**
 * Translates a Unicode code point into its UTF-8 byte sequence.
 *
 * @param int $code Unicode code point.
 * @return string One to four bytes of UTF-8, or '' when $code is
 *                negative, above U+10FFFF, or a surrogate
 *                (U+D800-U+DFFF).
 */
public static function unichr($code)
{
    $out_of_range = $code > 1114111 || $code < 0;
    $is_surrogate = $code >= 55296 && $code <= 57343;
    if ($out_of_range || $is_surrogate) {
        // bits are set outside the "valid" range as defined
        // by UNICODE 4.1.0
        return '';
    }

    // 1 byte: regular ASCII character
    if ($code < 128) {
        return chr($code);
    }

    // Every multi-byte form ends with the low six bits as 10xxxxxx.
    $last = chr(($code & 63) | 128);

    // 2 bytes: 110yyyyy 10xxxxxx
    if ($code < 2048) {
        return chr((($code & 2047) >> 6) | 192) . $last;
    }

    $third = chr((($code & 4032) >> 6) | 128);

    // 3 bytes: 1110zzzz 10yyyyyy 10xxxxxx
    if ($code < 65536) {
        return chr((($code >> 12) & 15) | 224) . $third . $last;
    }

    // 4 bytes: 11110www 10wwzzzz 10yyyyyy 10xxxxxx
    $lead = chr((($code >> 18) & 7) | 240);
    $second = chr((($code >> 12) & 63) | 128);

    return $lead . $second . $third . $last;
}
352 
/**
 * Reports whether iconv can be used for transcoding.
 *
 * True when the iconv() function exists and testIconvTruncateBug() does
 * not report ICONV_UNUSABLE. The answer is computed once and cached in a
 * function-static variable.
 *
 * @return bool
 */
public static function iconvAvailable()
{
    static $available = null;
    if ($available !== null) {
        return $available;
    }

    $available = function_exists('iconv')
        && self::testIconvTruncateBug() != self::ICONV_UNUSABLE;

    return $available;
}
364 
/**
 * Converts a string from the configured encoding to UTF-8.
 *
 * The source encoding is read from the 'Core.Encoding' directive. When it
 * is 'utf-8' the string is returned unchanged. Otherwise iconv is used if
 * it is available and 'Test.ForceNoIconv' is not set; 'iso-8859-1' has a
 * fallback through utf8_encode(). Any other situation raises an
 * E_USER_ERROR and falls off the end of the function.
 *
 * @param string $str     String to convert.
 * @param object $config  Configuration object; only its get() method is used.
 * @param object $context Not used by this method.
 * @return string|null The converted string, '' when iconv rejects the
 *                     encoding, or null after the "not supported" errors.
 */
public static function convertToUTF8($str, $config, $context)
{
    $encoding = $config->get('Core.Encoding');
    if ($encoding === 'utf-8') {
        return $str;
    }
    static $iconv = null;
    if ($iconv === null) {
        $iconv = self::iconvAvailable();
    }
    if ($iconv && !$config->get('Test.ForceNoIconv')) {
        // unaffected by bugs, since UTF-8 support all characters
        $str = self::unsafeIconv($encoding, 'utf-8//IGNORE', $str);
        if ($str === false) {
            // $encoding is not a valid encoding
            trigger_error('Invalid encoding ' . $encoding, E_USER_ERROR);
            return '';
        }
        // If the string is bjorked by Shift_JIS or a similar encoding
        // that doesn't support all of ASCII, convert the naughty
        // characters to their true byte-wise ASCII/UTF-8 equivalents.
        $str = strtr($str, self::testEncodingSupportsASCII($encoding));
        return $str;
    } elseif ($encoding === 'iso-8859-1') {
        $str = utf8_encode($str);
        return $str;
    }
    // Decide which diagnostic applies. Without this assignment $bug would
    // be undefined, null == ICONV_OK would hold, and the "buggy iconv"
    // message below could never be reached.
    $bug = self::testIconvTruncateBug();
    if ($bug == self::ICONV_OK) {
        trigger_error('Encoding not supported, please install iconv', E_USER_ERROR);
    } else {
        trigger_error(
            'You have a buggy version of iconv, see https://bugs.php.net/bug.php?id=48147 ' .
            'and http://sourceware.org/bugzilla/show_bug.cgi?id=13541',
            E_USER_ERROR
        );
    }
}
410 
/**
 * Converts a string from UTF-8 to the configured encoding.
 *
 * The target encoding is read from 'Core.Encoding'. When
 * 'Core.EscapeNonASCIICharacters' is set, non-ASCII characters are first
 * turned into numeric entities. iconv is used if it is available and
 * 'Test.ForceNoIconv' is not set; 'iso-8859-1' has a fallback through
 * utf8_decode(). Otherwise an E_USER_ERROR is raised and nothing is
 * returned.
 *
 * @param string $str     UTF-8 string to convert.
 * @param object $config  Configuration object; only its get() method is used.
 * @param object $context Not used by this method.
 * @return string|false|null Converted string; false if iconv() fails;
 *                           null after the "not supported" error.
 */
public static function convertFromUTF8($str, $config, $context)
{
    $encoding = $config->get('Core.Encoding');
    $escape = $config->get('Core.EscapeNonASCIICharacters');
    if ($escape) {
        $str = self::convertToASCIIDumbLossless($str);
    }
    if ($encoding === 'utf-8') {
        return $str;
    }

    static $iconv = null;
    if ($iconv === null) {
        $iconv = self::iconvAvailable();
    }

    if ($iconv && !$config->get('Test.ForceNoIconv')) {
        // Undo our previous fix in convertToUTF8, otherwise iconv will barf
        $ascii_fix = self::testEncodingSupportsASCII($encoding);
        if (!$escape && !empty($ascii_fix)) {
            // Strip every UTF-8 sequence that appears as a key of the fix map.
            $clear_fix = array_fill_keys(array_keys($ascii_fix), '');
            $str = strtr($str, $clear_fix);
        }
        $str = strtr($str, array_flip($ascii_fix));
        // Normal stuff
        return self::iconv('utf-8', $encoding . '//IGNORE', $str);
    }

    if ($encoding === 'iso-8859-1') {
        return utf8_decode($str);
    }

    trigger_error('Encoding not supported', E_USER_ERROR);
    // You might be tempted to assume that the ASCII representation
    // might be OK, however, this is *not* universally true over all
    // encodings. So we take the conservative route here, rather
    // than forcibly turn on %Core.EscapeNonASCIICharacters
}
457 
/**
 * Rewrites every non-ASCII character of a UTF-8 string as a decimal
 * numeric entity (&#NNN;), leaving ASCII bytes as they are.
 *
 * No validation is performed: the input is assumed to be well-formed
 * UTF-8, and a stray continuation byte still emits an entity built from
 * whatever has been accumulated so far.
 *
 * @param string $str UTF-8 string.
 * @return string ASCII-only string with numeric entities.
 */
public static function convertToASCIIDumbLossless($str)
{
    $encoded = '';
    $codepoint = 0; // code point being assembled
    $pending = 0;   // continuation bytes still expected

    $length = strlen($str);
    for ($pos = 0; $pos < $length; $pos++) {
        $byte = ord($str[$pos]);

        if ($byte <= 0x7F) { // 0xxx xxxx: plain ASCII
            $encoded .= chr($byte);
            $pending = 0;
            continue;
        }

        if ($byte > 0xBF) { // lead byte of a multi-byte sequence
            if ($byte <= 0xDF) { // 110x xxxx
                $codepoint = $byte & 0x1F;
                $pending = 1;
            } elseif ($byte <= 0xEF) { // 1110 xxxx
                $codepoint = $byte & 0x0F;
                $pending = 2;
            } else { // 1111 0xxx
                $codepoint = $byte & 0x07;
                $pending = 3;
            }
            continue;
        }

        // 10xx xxxx: continuation byte, shift in six more bits
        $codepoint = ($codepoint << 6) + ($byte & 0x3F);
        $pending--;
        if ($pending <= 0) {
            $encoded .= '&#' . $codepoint . ';';
        }
    }

    return $encoded;
}
505 
/**
 * testIconvTruncateBug() result: the probe conversion produced output
 * of exactly the expected length.
 */
const ICONV_OK = 0;

/**
 * testIconvTruncateBug() result: the probe conversion produced output
 * shorter than expected, i.e. iconv cut the text short.
 */
const ICONV_TRUNCATES = 1;

/**
 * testIconvTruncateBug() result: the probe conversion failed outright
 * (iconv returned false).
 */
const ICONV_UNUSABLE = 2;
516 
/**
 * Probes the installed iconv for the //IGNORE truncation bug.
 *
 * A string made of one non-ASCII character followed by 9000 'a' bytes is
 * converted from utf-8 to ascii//IGNORE; the length of the result decides
 * the verdict. The verdict is cached in a function-static variable.
 *
 * @return int|null ICONV_UNUSABLE if the conversion returned false,
 *                  ICONV_TRUNCATES if fewer than 9000 bytes came back,
 *                  ICONV_OK if exactly 9000 did. When more than 9000
 *                  bytes come back an E_USER_ERROR is raised, nothing is
 *                  cached, and null is returned.
 */
public static function testIconvTruncateBug()
{
    static $code = null;
    if ($code !== null) {
        return $code;
    }

    // better not use iconv, otherwise infinite loop!
    $probe = "\xCE\xB1" . str_repeat('a', 9000);
    $converted = self::unsafeIconv('utf-8', 'ascii//IGNORE', $probe);
    if ($converted === false) {
        $code = self::ICONV_UNUSABLE;
        return $code;
    }

    $produced = strlen($converted);
    if ($produced < 9000) {
        $code = self::ICONV_TRUNCATES;
    } elseif ($produced > 9000) {
        trigger_error(
            'Your copy of iconv is extremely buggy. Please notify HTML Purifier maintainers: ' .
            'include your iconv version as per phpversion()',
            E_USER_ERROR
        );
    } else {
        $code = self::ICONV_OK;
    }

    return $code;
}
553 
/**
 * Determines which printable ASCII characters an encoding lacks.
 *
 * Unless $bypass is set, previously computed results and a few
 * hard-coded answers (shift_jis, johab, any iso-8859-* name) are returned
 * without probing. Otherwise each character from 0x20 to 0x7E is pushed
 * through iconv, and those that do not survive are recorded. The probed
 * result is cached per $encoding.
 *
 * @param string $encoding Encoding name to test, as accepted by iconv.
 * @param bool   $bypass   Skip the cache and the hard-coded answers and
 *                         always probe.
 * @return array|false Map from the UTF-8 sequence that the encoding's
 *                     byte decodes to, to the ASCII character occupying
 *                     that byte value; empty when all of ASCII is
 *                     supported; false when iconv cannot convert to
 *                     $encoding at all.
 */
public static function testEncodingSupportsASCII($encoding, $bypass = false)
{
    // All calls to iconv here are unsafe, proof by case analysis:
    // If ICONV_OK, no difference.
    // If ICONV_TRUNCATE, all calls involve one character inputs,
    // so bug is not triggered.
    // If ICONV_UNUSABLE, this call is irrelevant
    static $encodings = array();

    if (!$bypass) {
        if (isset($encodings[$encoding])) {
            return $encodings[$encoding];
        }
        $lenc = strtolower($encoding);
        if ($lenc === 'shift_jis') {
            return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
        }
        if ($lenc === 'johab') {
            return array("\xE2\x82\xA9" => '\\');
        }
        if (strpos($lenc, 'iso-8859-') === 0) {
            return array();
        }
    }

    // Bail out early if the encoding is not usable as a target at all.
    if (self::unsafeIconv('UTF-8', $encoding, 'a') === false) {
        return false;
    }

    $ret = array();
    for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
        $c = chr($i); // UTF-8 char
        $r = self::unsafeIconv('UTF-8', $encoding . '//IGNORE', $c); // initial conversion

        $unsupported = ($r === '');
        if (!$unsupported && $r === $c) {
            // Needed for iconv implementations that do not omit
            // characters that do not exist in the target character set:
            // check that the byte does not round-trip.
            $unsupported = self::unsafeIconv($encoding, 'UTF-8//IGNORE', $r) !== $c;
        }

        if ($unsupported) {
            // Reverse engineer: what's the UTF-8 equiv of this byte
            // sequence? This assumes that there's no variable width
            // encoding that doesn't support ASCII.
            $utf8 = self::unsafeIconv($encoding, 'UTF-8//IGNORE', $c);
            $ret[$utf8] = $c;
        }
    }

    $encodings[$encoding] = $ret;
    return $ret;
}
609 }
610 
611 // vim: et sw=4 sts=4




Korrekturen, Hinweise und Ergänzungen

Bitte scheuen Sie sich nicht und melden Sie, was auf dieser Seite sachlich falsch oder irreführend ist, was ergänzt werden sollte, was fehlt usw. Dazu bitte oben aus dem Menü Seite den Eintrag Support Forum wählen. Es ist eine kostenlose Anmeldung erforderlich, um Anmerkungen zu posten. Unpassende Postings, Spam usw. werden kommentarlos entfernt.