GsgXml.class.php
12.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
<?php
/**
* Copyright 2005 Zervaas Enterprises (www.zervaas.com.au)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* GsgXml
*
* A PHP class for generating XML data for the Google Sitemaps service.
* This includes options for compressing the output
*
* @author Quentin Zervaas
* @version 1.0
*/
class GsgXml
{
    /**
     * Version of the Google Sitemap this class is for
     */
    var $sitemapVersion = '0.84';

    /**
     * URL to the namespace for Google Sitemaps. Used by generateXml() for
     * the xmlns and xsi:schemaLocation attributes of the root element.
     */
    var $namespace = 'http://www.google.com/schemas/sitemap/0.84';

    /**
     * An array to hold the URL data in prior to generating XML. Each entry
     * is an array with the key 'url' and, optionally, 'lastmod',
     * 'lastmod_dateonly', 'changefreq' and 'priority'.
     */
    var $urls = array();

    /**
     * Character encoding declared in the output XML prolog
     */
    var $xmlEncoding = 'UTF-8';

    /**
     * The different values allowed for change frequency, if specified
     */
    var $changeFreqs = array('always', 'hourly', 'daily', 'weekly',
                             'monthly', 'yearly', 'never');

    /**
     * Maximum length a URL can be.
     * NOTE(review): declared but not enforced anywhere in this class.
     */
    var $maxUrlLen = 2048;

    /**
     * The range of values priority can be, the step it is snapped down to,
     * and the sprintf() format used when writing it out
     */
    var $priorityMin = 0.0;
    var $priorityMax = 1.0;
    var $priorityStep = 0.1;
    var $priorityFormat = '%01.1f';

    /**
     * Format strings for representing last modified timestamps. They are
     * passed to date(); the timezone offset is appended separately because
     * PHP's 'O' flag outputs '+0000' rather than the required '+00:00'.
     */
    var $lastModDate = 'Y-m-d';
    var $lastModDateTime = 'Y-m-d\TH:i:s';

    /**
     * Name of the function used to compress the generated XML
     */
    var $compressFunc = 'gzencode';

    /**
     * Maximum number of URLs that can be specified
     */
    var $maxURLs = 50000;

    /**
     * Message describing why the last addUrl() call was rejected,
     * or an empty string if it was not rejected
     */
    var $errorMsg = '';

    /**
     * Lower-cased base URL and its length, set by the constructor.
     * Declared explicitly so they are not created as dynamic properties.
     */
    var $baseUrl = '';
    var $baseUrlLen = 0;

    /**
     * GsgXml constructor
     *
     * The constructor optionally allows you to specify a base URL, which
     * serves two purposes: 1) allows you to add URLs with paths only, and
     * auto-prepend the base URL in front of it, and 2) ensures added URLs
     * that include domain information match this base URL, as all URLs
     * in a single sitemap must be on the same scheme (http/https) and
     * domain.
     *
     * @access  public
     * @param   string  $baseUrl    Optional. The base URL for added URLs.
     *                              It is intended for a value such as
     *                              http://www.example.com however including
     *                              extra path info will work also, but if
     *                              extra path is included all added URLs
     *                              must be within this directory. The value
     *                              is stored lower-cased.
     */
    function __construct($baseUrl = '')
    {
        $this->baseUrl = strtolower((string) $baseUrl);
        $this->baseUrlLen = strlen($this->baseUrl); // cycle saver
    }

    /**
     * Legacy constructor name.
     *
     * Methods named after their class are no longer constructors as of
     * PHP 8.0, so the real work lives in __construct(). This alias is kept
     * for code that calls it explicitly (e.g. parent::GsgXml()).
     *
     * @access  public
     * @param   string  $baseUrl    Optional. See __construct().
     */
    function GsgXml($baseUrl = '')
    {
        $this->__construct($baseUrl);
    }

    /**
     * addUrl
     *
     * Adds a URL to the sitemap. All data is optional, except for the actual
     * URL. The URL can include the domain info, but if it doesn't then the
     * $pathOnly parameter should be set to true. The last modified timestamp
     * can be either a date and time, or a date only.
     *
     * On failure the reason is stored in $errorMsg.
     *
     * @access  public
     * @param   string  $url        The URL to add. The URL should not be escaped.
     * @param   bool    $pathOnly   Set to true if the URL contains only a path,
     *                              or leave at false if it contains the domain
     * @param   mixed   $lastModTs  The Unix/Epoch timestamp since the doc was modified,
     *                              or an already formatted date string which is
     *                              written out as given (XML-escaped).
     *                              Set this to null to not include this parameter.
     * @param   bool    $lastModTsDateOnly  If $lastModTs is a timestamp, then setting
     *                                      this to true means the timestamp date will
     *                                      only be output, not the time.
     * @param   string  $changeFreq     The frequency this URL is changed. Must be a valid
     *                                  value from class $changeFreqs, otherwise ignored.
     * @param   float   $priority       The priority of this URL relative to other URLs
     *                                  in the sitemap. Must be between $priorityMin and
     *                                  $priorityMax inclusive, otherwise ignored. It is
     *                                  snapped down to a multiple of $priorityStep.
     * @return  bool    True if URL was added, false if not (e.g. if didn't match $baseUrl)
     */
    function addUrl($url, $pathOnly = false, $lastModTs = null, $lastModTsDateOnly = false,
                    $changeFreq = null, $priority = null)
    {
        $this->errorMsg = '';

        if (count($this->urls) >= $this->maxURLs) {
            $this->errorMsg = "Only ".$this->maxURLs . " urls are allowed within a Google Sitemaps file.";
            return false;
        }

        if ($pathOnly) {
            $url = $this->baseUrl . $url;
            if ($this->baseUrlLen > 0) {
                // Collapse runs of slashes, starting at the last character of
                // the base URL so that "base/" + "/path" yields a single slash
                // while the "//" after the scheme is left alone.
                $keep = $this->baseUrlLen - 1;
                $url = substr($url, 0, $keep) . preg_replace('|/+|', '/', substr($url, $keep));
            }
        }
        else if ($this->baseUrlLen > 0) {
            // check if the added URL matches the baseUrl
            if ($this->baseUrl != strtolower(substr($url, 0, $this->baseUrlLen))) {
                $this->errorMsg = 'The following url does not match the base url ('.$this->baseUrl.'): ' . $url . '!';
                return false;
            }
        }

        $data = array('url' => $url);

        if (($lastModTs != '') && !is_null($lastModTs) && is_numeric($lastModTs)) {
            $data['lastmod'] = (int) $lastModTs;
            $data['lastmod_dateonly'] = (bool) $lastModTsDateOnly;
        } elseif (is_string($lastModTs) && $lastModTs != '') {
            // ts could be a preformatted string
            $data['lastmod'] = $lastModTs;
            $data['lastmod_dateonly'] = false;
        }

        // Strict comparison: a loose in_array() would accept values such as
        // 0 or true because they compare equal to the first allowed string.
        if (is_string($changeFreq) && in_array($changeFreq, $this->changeFreqs, true)) {
            $data['changefreq'] = $changeFreq;
        }

        // is_numeric() rejects null, '' and non-numeric strings while still
        // accepting a legitimate priority of 0.
        if (is_numeric($priority)) {
            $priority = (float) $priority;

            // ensure it's between the valid range, else ignore it
            if ($priority >= $this->priorityMin && $priority <= $this->priorityMax) {
                $data['priority'] = $this->_snapPriority($priority);
            }
        }

        $this->urls[] = $data;
        return true;
    }

    /**
     * _snapPriority
     *
     * Rounds a priority down to a multiple of $priorityStep.
     *
     * A small tolerance is added before flooring because quotients such as
     * 0.3 / 0.1 evaluate to 2.9999999999999996 in binary floating point,
     * which would otherwise turn 0.3 into 0.2.
     *
     * @access  private
     * @param   float   $priority   A priority already known to be in range
     * @return  float   The priority snapped down to the step grid
     */
    function _snapPriority($priority)
    {
        $step = (float) $this->priorityStep;
        if ($step <= 0) {
            // no usable grid, leave the value alone rather than divide by zero
            return $priority;
        }

        return floor($priority / $step + 1e-9) * $step;
    }

    /**
     * output
     *
     * Output the generated XML. The data can either be returned or output
     * directly. If it is not being returned (i.e. being output directly)
     * then you can optionally output the HTTP headers for the data.
     *
     * If compression is requested but the compression function is missing
     * or does not return a string, the uncompressed XML is used instead.
     *
     * @access  public
     * @param   bool    $return         Optional. True to return the XML, false to
     *                                  output it directly. If this is true, then
     *                                  the $sendHeaders parameter is ignored
     * @param   bool    $compress       Optional. True to compress the data using gzip
     * @param   bool    $sendHeaders    Optional. True to send HTTP headers. This
     *                                  parameter is only used if $return is false
     * @return  mixed   void if $return is false, XML string if $return
     *                  true and $compress false, gzip binary data
     *                  if $return true and $compress true
     */
    function output($return = true, $compress = false, $sendHeaders = false)
    {
        $xml = $this->generateXml();

        $compressed = false;
        if ($compress) {
            $packed = $this->compress($xml);
            if (is_string($packed)) {
                $xml = $packed;
                $compressed = true;
            }
        }

        if ($return) {
            return $xml;
        }

        if ($sendHeaders) {
            $mime = $compressed ? 'application/x-gzip' : 'text/xml';
            header('Content-type: ' . $mime);
            header('Content-length: ' . strlen($xml));
        }
        echo $xml;
    }

    /**
     * generateXml
     *
     * Builds the sitemap XML from all the added URLs
     *
     * @access  public
     * @return  string  The generated XML
     */
    function generateXml()
    {
        // PSNG_VERSION is defined outside this class; do not fail without it
        $generator = defined('PSNG_VERSION') ? constant('PSNG_VERSION') : 'unknown';
        $ns = $this->xmlEscape($this->namespace);

        $ret = array();
        $ret[] = sprintf('<?xml version="1.0" encoding="%s"?>', $this->xmlEncoding);
        $ret[] = sprintf("<urlset xmlns=\"%s\"\n"
                       . "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n"
                       . "xsi:schemaLocation=\"%s %s/sitemap.xsd\">", $ns, $ns, $ns);
        $ret[] = sprintf('<!-- Created by phpSitemapNG version %s -->', $generator);
        $ret[] = sprintf('<!-- Last update of sitemap %s -->', $this->_formatDateTime(time()));

        foreach ($this->urls as $url) {
            $ret[] = '<url>';
            $ret[] = sprintf('<loc>%s</loc>', $this->xmlEscape($url['url']));

            if (isset($url['lastmod'])) {
                if (is_numeric($url['lastmod'])) {
                    $stamp = !empty($url['lastmod_dateonly'])
                           ? date($this->lastModDate, (int) $url['lastmod'])
                           : $this->_formatDateTime((int) $url['lastmod']);
                    $ret[] = sprintf('<lastmod>%s</lastmod>', $stamp);
                } elseif (is_string($url['lastmod'])) {
                    // preformatted value supplied by the caller: escape it so
                    // it cannot break the document
                    $ret[] = sprintf('<lastmod>%s</lastmod>', $this->xmlEscape($url['lastmod']));
                }
            }

            if (isset($url['changefreq'])) {
                $ret[] = sprintf('<changefreq>%s</changefreq>',
                                 $this->xmlEscape($url['changefreq']));
            }

            if (isset($url['priority'])) {
                $ret[] = sprintf('<priority>' . $this->priorityFormat . '</priority>',
                                 $url['priority']);
            }

            $ret[] = '</url>';
        }

        $ret[] = '</urlset>';

        return join("\n", $ret);
    }

    /**
     * _formatDateTime
     *
     * Formats a timestamp with $lastModDateTime and appends the timezone
     * offset in '+hh:mm' form (date('O') yields '+hhmm').
     *
     * @access  private
     * @param   int     $timestamp  Unix timestamp to format
     * @return  string  The formatted date, time and offset
     */
    function _formatDateTime($timestamp)
    {
        $offset = date('O', $timestamp);

        return date($this->lastModDateTime, $timestamp)
             . substr($offset, 0, 3) . ':' . substr($offset, 3);
    }

    /**
     * compress
     *
     * Compresses a text string using the function named in $compressFunc,
     * and returns the compressed data
     *
     * @access  public
     * @param   string  The string to compress
     * @return  The compressed gzip data, or null if the compression callback is not found
     */
    function compress($string)
    {
        $func = $this->compressFunc;

        if (!is_string($func) || strlen($func) == 0 || !function_exists($func)) {
            return null;
        }

        return $func($string);
    }

    /**
     * xmlEscape
     *
     * Escapes a string to be used as XML character data.
     *
     * The characters < > " and ' are replaced by numeric character
     * references. An ampersand is replaced by &amp; unless it already
     * looks like the start of an entity or numeric reference, so input
     * that is partly escaped is not escaped twice. All other bytes are
     * passed through untouched.
     *
     * @param   string  $str    The string to escape
     * @return  string  The escaped string
     */
    function xmlEscape($str)
    {
        static $trans = array(
            '<'  => '&#60;',
            '>'  => '&#62;',
            '"'  => '&#34;',
            "'"  => '&#39;'
        );

        // escape only the ampersands that are not part of &xxx; or &#nnn;
        return preg_replace('/&(?![A-Za-z]{0,4}\w{2,3};|#[0-9]{2,4};)/',
                            '&amp;',
                            strtr((string) $str, $trans));
    }
}
?>