<?php

/**
 * Base class for HTML Purifier lexers.
 *
 * Provides the static factory that selects and instantiates a concrete
 * lexer, plus helpers shared by implementations: decoding of special
 * entities in text, escaping of CDATA sections, body extraction and
 * general normalization of the incoming HTML string.
 */
class HTMLPurifier_Lexer
{

    /**
     * Whether or not this lexer implementation tracks line numbers.
     * create() refuses to return a lexer with this flag unset when line
     * tracking is required by the configuration.
     */
    public $tracksLineNumbers = false;

    /**
     * Entity parser instance, assigned by the constructor.
     *
     * Declared explicitly so that the constructor does not create a
     * dynamic property; kept public because that is the visibility the
     * previously undeclared property had.
     */
    public $_entity_parser;

    // -- STATIC ----------------------------------------------------------

    /**
     * Retrieves or sets up a lexer instance.
     *
     * The lexer is taken from %Core.LexerImpl: an object is returned as
     * is, a recognized name ('DOMLex', 'DirectLex', 'PH5P') is
     * instantiated, and null triggers auto-detection.
     *
     * @param $config Instance of HTMLPurifier_Config. Passing a lexer
     *        prototype (object or name) instead is deprecated and raises
     *        an E_USER_WARNING.
     * @return Lexer instance.
     * @throws HTMLPurifier_Exception If the lexer name is unrecognized,
     *         no lexer could be instantiated, or line tracking is
     *         required but the chosen lexer does not support it.
     */
    public static function create($config) {

        // Line tracking requirements can only be read from a real config
        // object; a deprecated prototype has no configuration to query.
        $needs_tracking = false;

        if (!($config instanceof HTMLPurifier_Config)) {
            $lexer = $config;
            trigger_error("Passing a prototype to HTMLPurifier_Lexer::create() is deprecated, please instead use %Core.LexerImpl", E_USER_WARNING);
        } else {
            $lexer = $config->get('Core.LexerImpl');
            $needs_tracking =
                $config->get('Core.MaintainLineNumbers') ||
                $config->get('Core.CollectErrors');
        }

        $inst = null;
        if (is_object($lexer)) {
            $inst = $lexer;
        } else {

            if (is_null($lexer)) {
                // auto-detection algorithm
                if ($needs_tracking) {
                    $lexer = 'DirectLex';
                } elseif (
                    class_exists('DOMDocument') &&
                    method_exists('DOMDocument', 'loadHTML') &&
                    !extension_loaded('domxml')
                ) {
                    // check for DOM support, because while it's part of the
                    // core, it can be disabled compile time. Also, the PECL
                    // domxml extension overrides the default DOM, and is evil
                    // and nasty and we shan't bother to support it
                    $lexer = 'DOMLex';
                } else {
                    $lexer = 'DirectLex';
                }
            }

            // instantiate recognized string names
            switch ($lexer) {
                case 'DOMLex':
                    $inst = new HTMLPurifier_Lexer_DOMLex();
                    break;
                case 'DirectLex':
                    $inst = new HTMLPurifier_Lexer_DirectLex();
                    break;
                case 'PH5P':
                    $inst = new HTMLPurifier_Lexer_PH5P();
                    break;
                default:
                    throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
            }
        }

        if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');

        // once PHP DOM implements native line numbers, or we
        // hack out something using XSLT, remove this stipulation
        if ($needs_tracking && !$inst->tracksLineNumbers) {
            throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
        }

        return $inst;

    }

    // -- CONVENIENCE MEMBERS ---------------------------------------------

    /**
     * Sets up the entity parser used by parseData() and normalize().
     */
    public function __construct() {
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }

    /**
     * Most common entity to raw value conversion table for special
     * entities; applied with strtr() in parseData().
     */
    protected $_special_entity2str =
        array(
            '&quot;' => '"',
            '&amp;'  => '&',
            '&lt;'   => '<',
            '&gt;'   => '>',
            '&#39;'  => "'",
            '&#039;' => "'",
            '&#x27;' => "'"
        );

    /**
     * Parses special entities into the proper characters.
     *
     * The common entities from $_special_entity2str are replaced with a
     * single strtr() pass. Ampersand counts taken before and after that
     * pass decide whether other entities may remain; only then is the
     * string handed to the entity parser.
     *
     * @param $string String character data to be parsed.
     * @return Parsed character data.
     */
    public function parseData($string) {

        // following functions require at least one character
        if ($string === '') return '';

        // subtracts amps that cannot possibly be escaped
        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        if (!$num_amp) return $string; // abort if no entities

        // every &amp; leaves one literal ampersand behind after strtr(),
        // so that many ampersands are expected to remain
        $num_esc_amp = substr_count($string, '&amp;');
        $string = strtr($string, $this->_special_entity2str);

        // code duplication for sake of optimization, see above
        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
            ($string[strlen($string)-1] === '&' ? 1 : 0);

        if ($num_amp_2 <= $num_esc_amp) return $string;

        // hmm... now we have some uncommon entities. Use the callback.
        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }

    /**
     * Lexes an HTML string into tokens.
     *
     * This base implementation only raises an E_USER_ERROR; concrete
     * lexers are expected to override it.
     *
     * @param $string String HTML.
     * @param $config Configuration object.
     * @param $context Context object.
     */
    public function tokenizeHTML($string, $config, $context) {
        trigger_error('Call to abstract class', E_USER_ERROR);
    }

    /**
     * Translates (non-empty) CDATA sections into regular escaped text.
     *
     * @param $string HTML string to process.
     * @return HTML with CDATA sections replaced by their escaped contents.
     */
    protected static function escapeCDATA($string) {
        return preg_replace_callback(
            '/<!\[CDATA\[(.+?)\]\]>/s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Special CDATA case: sections wrapped in the
     * <!--//--><![CDATA[//><!-- ... //--><!]]> comment construct are
     * replaced by their escaped contents.
     *
     * @param $string HTML string to process.
     * @return HTML with commented CDATA sections escaped.
     */
    protected static function escapeCommentedCDATA($string) {
        return preg_replace_callback(
            '#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
            array('HTMLPurifier_Lexer', 'CDATACallback'),
            $string
        );
    }

    /**
     * Callback function for escapeCDATA() and escapeCommentedCDATA()
     * that does the work.
     *
     * @param $matches PCRE matches array, with index 0 the entire match
     *        and 1 the inside of the CDATA section.
     * @return Escaped internals of the CDATA section.
     */
    protected static function CDATACallback($matches) {
        // not exactly sure why the character set is needed, but whatever
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }

    /**
     * Takes a piece of HTML and normalizes it by converting entities,
     * fixing encoding, extracting bits, and other good stuff.
     *
     * @param $html HTML string to normalize.
     * @param $config Configuration object; HTML.Trusted and
     *        Core.ConvertDocumentToFragment are consulted.
     * @param $context Context object (not used by this implementation).
     * @return Normalized HTML string.
     */
    public function normalize($html, $config, $context) {

        // normalize newlines to \n
        $html = str_replace("\r\n", "\n", $html);
        $html = str_replace("\r", "\n", $html);

        if ($config->get('HTML.Trusted')) {
            // escape convoluted CDATA
            $html = $this->escapeCommentedCDATA($html);
        }

        // escape CDATA
        $html = $this->escapeCDATA($html);

        // extract body from document if applicable
        if ($config->get('Core.ConvertDocumentToFragment')) {
            $html = $this->extractBody($html);
        }

        // expand entities that aren't the big five
        $html = $this->_entity_parser->substituteNonSpecialEntities($html);

        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
        // represent non-SGML characters (horror, horror!)
        $html = HTMLPurifier_Encoder::cleanUTF8($html);

        return $html;
    }

    /**
     * Takes a string of HTML (fragment or document) and returns the
     * content of its body element.
     *
     * The match is greedy and case-insensitive: everything between the
     * first <body ...> tag and the last </body> tag is returned. When no
     * such pair is found the input is returned unchanged.
     *
     * @param $html HTML string.
     * @return Body contents, or the original string.
     */
    public function extractBody($html) {
        $matches = array();
        $result = preg_match('!<body[^>]*>(.*)</body>!is', $html, $matches);
        if ($result) {
            return $matches[1];
        } else {
            return $html;
        }
    }

}

// vim: et sw=4 sts=4
// Copyright © 2003 - 2009 MyOOS [Shopsystem]. All rights reserved. MyOOS [Shopsystem] is Free Software released under the GNU/GPL License. Webmaster: info@r23.de (Impressum)
