1: <?php
2:
3: /**
4: * This is the HTML pseudo-parser for the Yadis library.
5: *
6: * PHP versions 4 and 5
7: *
8: * LICENSE: See the COPYING file included in this distribution.
9: *
10: * @package OpenID
11: * @author JanRain, Inc. <openid@janrain.com>
12: * @copyright 2005-2008 Janrain, Inc.
13: * @license http://www.apache.org/licenses/LICENSE-2.0 Apache
14: */
15:
16: /**
17: * This class is responsible for scanning an HTML string to find META
18: * tags and their attributes. This is used by the Yadis discovery
19: * process. This class must be instantiated to be used.
20: *
21: * @package OpenID
22: */
class Auth_Yadis_ParseHTML {

    /**
     * PCRE modifiers appended to every pattern this class builds:
     * "s" lets "." match newlines and "i" makes matching
     * case-insensitive.
     *
     * @access private
     */
    var $_re_flags = "si";

    /**
     * Matches the regions that are stripped from a document before it
     * is scanned: comments, CDATA sections and <script> elements.
     * Holds the bare expression until the constructor wraps it in
     * delimiters and flags.
     *
     * @access private
     */
    var $_removed_re =
        "<!--.*?-->|<!\[CDATA\[.*?\]\]>|<script\b(?!:)[^>]*>.*?<\/script>";

    /**
     * sprintf() template used by tagPattern(). The placeholders are,
     * in order: the closing-slash expression, the tag name expression
     * and the self-closing-slash expression.
     *
     * @access private
     */
    var $_tag_expr = "<%s%s(?:\s.*?)?%s>";

    /**
     * Matches a single name=value attribute inside a tag. Group 1 is
     * the attribute name and group 2 the (possibly quoted) value.
     * Holds the bare expression until the constructor wraps it in
     * delimiters and flags.
     *
     * @access private
     */
    var $_attr_find = '\b([-\w]+)=(".*?"|\'.*?\'|.+?)[\/\s>]';

    /**
     * Map of named entity (without "&" and ";") to the character it
     * stands for. Populated by the constructor.
     *
     * @access private
     */
    var $_entity_replacements = array();

    /**
     * Undelimited expression matching any of the named entities in
     * $_entity_replacements; group 1 is the entity name. Populated by
     * the constructor.
     *
     * @access private
     */
    var $_ent_replace = '';

    /**
     * Turn the bare expressions declared above into complete PCRE
     * patterns and set up the entity tables.
     */
    function __construct()
    {
        $this->_attr_find = sprintf("/%s/%s",
                                    $this->_attr_find,
                                    $this->_re_flags);

        $this->_removed_re = sprintf("/%s/%s",
                                     $this->_removed_re,
                                     $this->_re_flags);

        $this->_entity_replacements = array(
            'amp' => '&',
            'lt' => '<',
            'gt' => '>',
            'quot' => '"'
        );

        // The alternation must list the entity NAMES (the array keys),
        // not the characters they decode to.
        $this->_ent_replace =
            sprintf("&(%s);", implode("|",
                                      array_keys($this->_entity_replacements)));
    }

    /**
     * Legacy (PHP 4 style) constructor name. PHP 8 no longer treats a
     * method named after the class as the constructor, so the real
     * work lives in __construct(); this shim only exists so that code
     * calling parent::Auth_Yadis_ParseHTML() explicitly keeps working.
     * It is declared after __construct() on purpose so that PHP 5
     * still picks __construct() as the constructor.
     */
    function Auth_Yadis_ParseHTML()
    {
        $this->__construct();
    }

    /**
     * Replace HTML entities (amp, lt, gt, and quot) as well as
     * numeric entities (e.g. &#x9f; or &#65;) with their actual
     * values and return the new string.
     *
     * All entities are decoded in a single pass, so text produced by
     * decoding one entity is never decoded a second time (for example
     * "&amp;lt;" becomes "&lt;", not "<").
     *
     * @access private
     * @param string $str The string in which to look for entities
     * @return string $new_str The new string with entities decoded
     */
    function replaceEntities($str)
    {
        // Group 1: entity name, group 2: hex digits, group 3: decimal
        // digits. Named entities are matched case-sensitively, the
        // hexadecimal form accepts either case.
        $pattern = '~' . $this->_ent_replace .
            '|&#[xX]([0-9a-fA-F]+);|&#([0-9]+);~';

        return preg_replace_callback($pattern,
                                     array($this, '_decodeEntity'),
                                     $str);
    }

    /**
     * preg_replace_callback() handler for replaceEntities().
     *
     * @access private
     * @param array $match Match array: [1] is the entity name (empty
     * for numeric entities), [2] the hexadecimal digits (empty or
     * absent otherwise) and [3] the decimal digits (absent otherwise)
     * @return string The decoded character
     */
    function _decodeEntity($match)
    {
        if ($match[1] !== '') {
            return $this->_entity_replacements[$match[1]];
        }

        // Trailing unmatched groups are omitted from the match array,
        // so the presence of index 3 identifies a decimal entity.
        // chr() produces a single byte; codes above 255 wrap around.
        if (isset($match[3])) {
            return chr(intval($match[3], 10));
        }

        return chr(intval($match[2], 16));
    }

    /**
     * Strip single and double quotes off of a string, if they are
     * present. The quotes are only removed when the same kind of
     * quote both starts and ends the string.
     *
     * @access private
     * @param string $str The original string
     * @return string $new_str The new string with leading and
     * trailing quotes removed
     */
    function removeQuotes($str)
    {
        // The "s" modifier is required because $_attr_find can capture
        // quoted values that span several lines.
        $quoted_forms = array('/^"(.*)"$/s', '/^\'(.*)\'$/s');

        foreach ($quoted_forms as $pattern) {
            $matches = array();
            if (preg_match($pattern, $str, $matches)) {
                return $matches[1];
            }
        }

        return $str;
    }

    /**
     * Create a regular expression that will match an opening
     * or closing tag from a set of names.
     *
     * @access private
     * @param mixed $tag_names Tag name, or array of tag names, to match
     * @param mixed $close false/0 = no, true/1 = yes, other = maybe
     * @param mixed $self_close false/0 = no, true/1 = yes, other = maybe
     * @return string $regex A regular expression string to be used
     * in, say, preg_match.
     */
    function tagPattern($tag_names, $close, $self_close)
    {
        if (is_array($tag_names)) {
            $tag_names = '(?:'.implode('|',$tag_names).')';
        }

        // The loose "== 1" comparisons are intentional: both true and
        // 1 mean "required", any other truthy value means "optional".
        if ($close) {
            $close = '\/' . (($close == 1)? '' : '?');
        } else {
            $close = '';
        }

        if ($self_close) {
            $self_close = '(?:\/\s*)' . (($self_close == 1)? '' : '?');
        } else {
            $self_close = '';
        }

        $expr = sprintf($this->_tag_expr, $close, $tag_names, $self_close);

        return sprintf("/%s/%s", $expr, $this->_re_flags);
    }

    /**
     * Given an HTML document string, this finds all the META tags in
     * the document, provided they are found in the
     * <HTML><HEAD>...</HEAD> section of the document. The <HTML> tag
     * may be missing.
     *
     * @access private
     * @param string $html_string An HTML document string
     * @return array $tag_list Array of tags; each tag is an array of
     * attribute -> value, with attribute names lower-cased.
     */
    function getMetaTags($html_string)
    {
        // Drop comments, CDATA sections and scripts so that tags
        // inside them are never considered.
        $html_string = preg_replace($this->_removed_re,
                                    "",
                                    $html_string);

        // preg_replace() yields null when PCRE fails (for example on a
        // backtrack limit); there is nothing safe to scan in that case.
        if (!is_string($html_string)) {
            return array();
        }

        // Index: 0 = <html>, 1 = <head>, 2 = </head>, 3 = </html>,
        // 4 = first tag that implicitly ends the head section.
        $key_tags = array($this->tagPattern('html', false, false),
                          $this->tagPattern('head', false, false),
                          $this->tagPattern('head', true, false),
                          $this->tagPattern('html', true, false),
                          $this->tagPattern(array(
                              'body', 'frameset', 'frame', 'p', 'div',
                              'table','span','a'), 'maybe', 'maybe'));

        // Byte offset of the first occurrence of each key tag, or null
        // when the tag does not occur.
        $key_tags_pos = array();
        foreach ($key_tags as $pat) {
            $matches = array();
            preg_match($pat, $html_string, $matches, PREG_OFFSET_CAPTURE);
            if ($matches) {
                $key_tags_pos[] = $matches[0][1];
            } else {
                $key_tags_pos[] = null;
            }
        }

        // no opening head tag
        if (is_null($key_tags_pos[1])) {
            return array();
        }

        // the effective </head> is the min of the following
        if (is_null($key_tags_pos[2])) {
            $key_tags_pos[2] = strlen($html_string);
        }
        foreach (array($key_tags_pos[3], $key_tags_pos[4]) as $pos) {
            if (!is_null($pos) && $pos < $key_tags_pos[2]) {
                $key_tags_pos[2] = $pos;
            }
        }

        // closing head tag comes before opening head tag
        if ($key_tags_pos[1] > $key_tags_pos[2]) {
            return array();
        }

        // if there is an opening html tag, make sure the opening head tag
        // comes after it
        if (!is_null($key_tags_pos[0]) && $key_tags_pos[1] < $key_tags_pos[0]) {
            return array();
        }

        // Only the head section is scanned for META tags.
        $html_string = substr($html_string, $key_tags_pos[1],
                              ($key_tags_pos[2]-$key_tags_pos[1]));

        $link_data = array();
        $link_matches = array();

        if (!preg_match_all($this->tagPattern('meta', false, 'maybe'),
                            $html_string, $link_matches)) {
            return array();
        }

        foreach ($link_matches[0] as $link) {
            $attr_matches = array();
            preg_match_all($this->_attr_find, $link, $attr_matches);

            $link_attrs = array();
            foreach ($attr_matches[1] as $index => $name) {
                $value = $this->replaceEntities(
                    $this->removeQuotes($attr_matches[2][$index]));

                $link_attrs[strtolower($name)] = $value;
            }
            $link_data[] = $link_attrs;
        }

        return $link_data;
    }

    /**
     * Looks for a META tag with an "http-equiv" attribute whose value
     * is one of ("x-xrds-location", "x-yadis-location"), ignoring
     * case. If such a META tag is found, its "content" attribute
     * value is returned.
     *
     * @param string $html_string An HTML document in string format
     * @return mixed $content The "content" attribute value of the
     * first matching META tag, or null if no such tag was found.
     */
    function getHTTPEquiv($html_string)
    {
        $wanted = array('x-xrds-location', 'x-yadis-location');

        foreach ($this->getMetaTags($html_string) as $tag) {
            if (array_key_exists('http-equiv', $tag) &&
                in_array(strtolower($tag['http-equiv']), $wanted, true) &&
                array_key_exists('content', $tag)) {
                return $tag['content'];
            }
        }

        return null;
    }
}
258:
259: