mirror of
				https://github.com/ezyang/htmlpurifier.git
				synced 2025-10-26 02:56:47 +02:00 
			
		
		
		
	git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1349 48356398-32a2-884e-a903-53898d9a118a
		
			
				
	
	
		
			202 lines
		
	
	
		
			7.4 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
			
		
		
	
	
			202 lines
		
	
	
		
			7.4 KiB
		
	
	
	
		
			HTML
		
	
	
	
	
	
| <?xml version="1.0" encoding="UTF-8"?>
 | |
| <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 | |
|     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
 | |
| <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
 | |
| <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
 | |
| <meta name="description" content="Tutorial for creating custom URI filters." />
 | |
| <link rel="stylesheet" type="text/css" href="style.css" />
 | |
| 
 | |
| <title>URI Filters - HTML Purifier</title>
 | |
| 
 | |
| </head><body>
 | |
| 
 | |
| <h1>URI Filters</h1>
 | |
| 
 | |
| <div id="filing">Filed under End-User</div>
 | |
| <div id="index">Return to the <a href="index.html">index</a>.</div>
 | |
| <div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
 | |
| 
 | |
| <p>
 | |
|   This is a quick and dirty document to get you on your way to writing
 | |
|   custom URI filters for your own URL filtering needs.  Why would you
 | |
|   want to write a URI filter?  If you need URIs your users put into
 | |
|   HTML to magically change into a different URI, this is
 | |
|   exactly what you need!
 | |
| </p>
 | |
| 
 | |
| <h2>Creating the class</h2>
 | |
| 
 | |
| <p>
 | |
|   Any URI filter you make will be a subclass of <code>HTMLPurifier_URIFilter</code>.
 | |
|   The scaffolding is thus:
 | |
| </p>
 | |
| 
 | |
| <pre>class HTMLPurifier_URIFilter_<strong>NameOfFilter</strong> extends HTMLPurifier_URIFilter
 | |
| {
 | |
|     var $name = '<strong>NameOfFilter</strong>';
 | |
|     function prepare($config) {}
 | |
|     function filter(&$uri, $config, &$context) {}
 | |
| }</pre>
 | |
| 
 | |
| <p>
 | |
|   Fill in the variable <code>$name</code> with the name of your filter, and
 | |
|   take a look at the two methods. <code>prepare()</code> is an initialization
 | |
|   method that is called only once, before any filtering of the HTML has
 | |
|   been done. Use it to perform any costly setup work that only needs to be done
 | |
|   once. <code>filter()</code> is the guts and innards of our filter:
 | |
|   it takes the URI and does whatever needs to be done to it.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|   If you've worked with HTML Purifier, you'll recognize the <code>$config</code>
 | |
|   and <code>$context</code> parameters.  On the other hand, <code>$uri</code>
 | |
|   is something unique to this section of the application: it's a
 | |
|   <code>HTMLPurifier_URI</code> object. The interface is thus:
 | |
| </p>
 | |
| 
 | |
| <pre>class HTMLPurifier_URI
 | |
| {
 | |
|     var $scheme, $userinfo, $host, $port, $path, $query, $fragment;
 | |
|     function HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment);
 | |
|     function toString();
 | |
|     function copy();
 | |
|     function getSchemeObj($config, &$context);
 | |
|     function validate($config, &$context);
 | |
| }</pre>
 | |
| 
 | |
| <p>
 | |
|   The first three methods are fairly self-explanatory: you have a constructor,
 | |
|   a serializer, and a cloner.  Generally, you won't be using them when
 | |
|   you are manipulating the URI objects themselves.
 | |
|   <code>getSchemeObj()</code> is a special purpose method that returns
 | |
|   a <code>HTMLPurifier_URIScheme</code> object corresponding to the specific
 | |
|   URI at hand. <code>validate()</code> performs general-purpose validation
 | |
|   on the internal components of a URI. Once again, you don't need to
 | |
|   worry about these: they've already been handled for you.
 | |
| </p>
 | |
| 
 | |
| <h2>URI format</h2>
 | |
| 
 | |
| <p>
 | |
|   When writing a URI filter, we're interested in the member variables of the URI object.
 | |
| </p>
 | |
| 
 | |
| <table class="quick"><tbody>
 | |
|   <tr><th>Scheme</th>   <td>The protocol for identifying (and possibly locating) a resource (http, ftp, https)</td></tr>
 | |
|   <tr><th>Userinfo</th> <td>User information such as a username (bob)</td></tr>
 | |
|   <tr><th>Host</th>     <td>Domain name or IP address of the server (example.com, 127.0.0.1)</td></tr>
 | |
|   <tr><th>Port</th>     <td>Network port number for the server (80, 12345)</td></tr>
 | |
|   <tr><th>Path</th>     <td>Data that identifies the resource, possibly hierarchical (/path/to, ed@example.com)</td></tr>
 | |
|   <tr><th>Query</th>    <td>String of information to be interpreted by the resource (q=search-term)</td></tr>
 | |
|   <tr><th>Fragment</th> <td>Additional information for the resource after retrieval (bookmark)</td></tr>
 | |
| </tbody></table>
 | |
| 
 | |
| <p>
 | |
|   Because the URI is presented to us in this form, and not 
 | |
|   <code>http://bob@example.com:8080/foo.php?q=string#hash</code>, it saves us
 | |
|   a lot of trouble in having to parse the URI every time we want to filter
 | |
|   it. For the record, the above URI has the following components:
 | |
| </p>
 | |
| 
 | |
| <table class="quick"><tbody>
 | |
|   <tr><th>Scheme</th>   <td>http</td></tr>
 | |
|   <tr><th>Userinfo</th> <td>bob</td></tr>
 | |
|   <tr><th>Host</th>     <td>example.com</td></tr>
 | |
|   <tr><th>Port</th>     <td>8080</td></tr>
 | |
|   <tr><th>Path</th>     <td>/foo.php</td></tr>
 | |
|   <tr><th>Query</th>    <td>q=string</td></tr>
 | |
|   <tr><th>Fragment</th> <td>hash</td></tr>
 | |
| </tbody></table>
 | |
| 
 | |
| <p>
 | |
|   Note that there is no question mark or octothorpe in the query or
 | |
|   fragment: these get removed during parsing.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|   With this information, you can get straight to implementing your
 | |
|   <code>filter()</code> method. But one more thing...
 | |
| </p>
 | |
| 
 | |
| <h2>Return value: Boolean, not URI</h2>
 | |
| 
 | |
| <p>
 | |
|   You may have noticed that the URI is being passed in by reference.
 | |
|   This means that whatever changes you make to it, those changes will
 | |
|   be reflected in the URI object the caller had.  <strong>Do not
 | |
|   return the URI object: it is unnecessary and will cause bugs.</strong>
 | |
|   Instead, return a boolean value, true if the filtering was successful,
 | |
|   or false if the URI is beyond repair and needs to be axed.
 | |
| </p>
 | |
| 
 | |
| <p>
 | |
|   Let's suppose I wanted to write a filter that de-internationalized domain
 | |
|   names by converting them to <a href="http://en.wikipedia.org/wiki/Punycode">Punycode</a>.
 | |
|   Assuming that <code>punycode_encode($input)</code> converts <code>$input</code> to
 | |
|   Punycode and returns <code>false</code> on failure:
 | |
| </p>
 | |
| 
 | |
| <pre>class HTMLPurifier_URIFilter_ConvertIDNToPunycode extends HTMLPurifier_URIFilter
 | |
| {
 | |
|     var $name = 'ConvertIDNToPunycode';
 | |
|     function filter(&$uri, $config, &$context) {
 | |
|         if (is_null($uri->host)) return true;
 | |
|         if ($uri->host == utf8_decode($uri->host)) {
 | |
|             // is ASCII, abort
 | |
|             return true;
 | |
|         }
 | |
|         $host = punycode_encode($uri->host);
 | |
|         if ($host === false) return false;
 | |
|         $uri->host = $host;
 | |
|         return true;
 | |
|     }
 | |
| }</pre>
 | |
| 
 | |
| <p>
 | |
|   Notice I did not <code>return $uri;</code>.
 | |
| </p>
 | |
| 
 | |
| <h2>Activating your filter</h2>
 | |
| 
 | |
| <p>
 | |
|   Having a filter is all well and good, but you need to tell HTML Purifier
 | |
|   to use it. Fortunately, this part's simple:
 | |
| </p>
 | |
| 
 | |
| <pre>$uri =& $config->getDefinition('URI');
 | |
| $uri->addFilter(new HTMLPurifier_URIFilter_<strong>NameOfFilter</strong>());</pre>
 | |
| 
 | |
| <p>
 | |
|   If you want to be really fancy, you can define a configuration directive
 | |
|   for your filter and have HTML Purifier automatically manage whether or
 | |
|   not your filter gets loaded (this is how internal filters manage
 | |
|   things):
 | |
| </p>
 | |
| 
 | |
| <pre>HTMLPurifier_ConfigSchema::define(
 | |
|     'URI', '<strong>NameOfFilter</strong>', false, 'bool',
 | |
|     '<strong>What your filter does.</strong>'
 | |
| );
 | |
| $uri =& $config->getDefinition('URI', true);
 | |
| $uri->registerFilter(new HTMLPurifier_URIFilter_<strong>NameOfFilter</strong>());
 | |
| </pre>
 | |
| 
 | |
| <p>
 | |
|   Now, your filter will only be called when %URI.<strong>NameOfFilter</strong>
 | |
|   is set to true.
 | |
| </p>
 | |
| 
 | |
| <h2>Examples</h2>
 | |
| 
 | |
| <p>
 | |
|   Check the
 | |
|   <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/URIFilter/">URIFilter</a>
 | |
|   directory for more implementation examples, and see <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/docs/proposal-new-directives.txt">the
 | |
|   new directives proposal document</a> for ideas on what could be implemented
 | |
|   as a filter.
 | |
| </p>
 | |
| 
 | |
| <div id="version">$Id$</div>
 | |
| 
 | |
| </body></html>
 |