protocol = $url['scheme'];
} else {
$this->protocol = "http";
}
$this->host = $url['host'];
if (substr($this->host, 0, 3) == 'www')
$this->withWWW = TRUE;
}
$this->url = $this->protocol.'://'.$this->host.'/';
$this->todo[] = $this->url;
$this->deadline = $deadline;
// debug('', 'Crawler created for host '.$this->host.' with protocol '.$this->protocol);
}
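/**
* crawls all urls that are in the todo list
* algorithm: noted here as "breadth first search (former algorithm: dfs)";
* note that the todo list is filled with array_push() and consumed with
* array_pop(), i.e. the most recently found url is processed first
*/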
function start() {
    // Rewind the internal pointer of the todo list before it is consumed.
    reset($this->todo);
    while (TRUE) {
        // A deadline of 0 disables the time limit; otherwise stop once it has been reached.
        $timeIsUp = ($this->deadline != 0) && (($this->deadline - $this->microtime_float()) <= 0);
        if ($timeIsUp) {
            break;
        }
        // array_pop() hands out the entry at the end of the list.
        $next = array_pop($this->todo);
        if (is_null($next) || $next == '') {
            break; // list exhausted (or an empty entry was found)
        }
        $this->_getFilesForURL($next);
    }
    // Order the collected files by url and rewind them for later iteration.
    ksort($this->files);
    reset($this->files);
    return count($this->files);
}
/**
* Returns the current time as a float: seconds plus the fractional part.
* Built from the "usec sec" string that microtime() returns.
*/
function microtime_float() {
    $parts = explode(" ", microtime());
    $fraction = (float) $parts[0];
    $seconds = (float) $parts[1];
    return ($fraction + $seconds);
}
/**
* Returns the todo list, i.e. the urls start() has not processed yet.
*/
function getTodo() {
    $pending = $this->todo;
    return $pending;
}
/**
* Returns the list of files; the crawler stores one info array per url in it,
* using the url as key.
*/
function getFiles() {
    $collected = $this->files;
    return $collected;
}
/**
* Returns the urls that have already been handed to _getFilesForURL().
*/
function getDone() {
    $visited = $this->visitedUrls;
    return $visited;
}
/**
* Replaces the todo list (the urls start() will process).
* The value is stored as given, without any check.
*/
function setTodo($todo) {
    $pending = $todo;
    $this->todo = $pending;
}
/**
* Replaces the list of files. Anything that is not an array is ignored
* and the current list is kept.
*/
function setFiles($files) {
    if (!is_array($files)) {
        return;
    }
    $this->files = $files;
}
/**
* Restores the list of already visited urls (the counterpart of getDone()).
*
* getDone() and _getFilesForURL() work on $this->visitedUrls, so that is the
* property that has to be restored. $this->done is still written as before in
* case code outside this class reads it.
*
* @param array $done urls that were already visited; non-arrays leave the visited list untouched
*/
function setDone($done) {
    $this->done = $done;
    if (is_array($done)) {
        $this->visitedUrls = $done;
    }
}
/**
* Sets the start path; _isLocal() appends it to the host when it decides
* whether an url belongs to the crawled area.
*/
function setDirectory($dir) {
    $startPath = $dir;
    $this->path = $startPath;
}
/**
* returns number of files
*/
function size() {
    // one key per entry in the list of files
    $keys = array_keys($this->files);
    return count($keys);
}
/**
* Returns TRUE when the todo list is empty.
*/
function hasFinished() {
    $remaining = count($this->todo);
    return ($remaining == 0);
}
/**
* returns TRUE as long as there are items left that getNext() has not returned yet
* behaves like hasNext() of an iterator in java
*/
function hasNext() {
    // TRUE while the iteration position is still inside the list of files
    return ($this->size() > $this->cur_item);
}
/**
* returns the current item and moves on to the next one;
* returns NULL when there are no items left
* behaves like in java
*/
function getNext() {
    // the key list is built on first use and reused by later calls
    if (count($this->keys) == 0) {
        $this->keys = array_keys($this->files);
    }
    if (!$this->hasNext()) {
        return NULL;
    }
    // look up the entry at the current position, then advance the position
    $key = $this->keys[$this->cur_item];
    $entry = $this->files[$key];
    $this->cur_item++;
    return $entry;
}
/**
* fetches $url, adds it to the list of files and puts the local links
* extracted from its content on the todo list
*/
function _getFilesForURL($url) {
    // Returns TRUE when the url was processed; returns nothing (NULL) when it was skipped.
    // Every url handed to this method is remembered, even when it is skipped below.
    $this->visitedUrls[] = $url;

    // already in list of files: nothing to do
    if (array_key_exists($url, $this->files)) {
        return;
    }
    // links that _isLocal() rejects (other host / outside the start path) are not crawled
    if (!($this->_isLocal($url))) {
        return;
    }

    // fetch content for given url and extract the header information
    $response = $this->_getURL($url);
    $info = $this->_handleHeaders($response['header']);
    $res = $response['content'];

    // 4xx: the page is not accessible, just leave it
    // (compared as integers; the upper bound includes 499)
    $status = (int) $info['http_status'];
    if ($status >= 400 && $status <= 499) {
        return;
    }

    if ($info['location'] == '') {
        // regular page: add it to the list of files
        // (it cannot be in the list yet, that case returned above)
        $info['file'] = $url;
        $this->files[$url] = $info;
        $this->fileCounter++;
        info('Computing '.$url);
    } else {
        // The url is only a redirect: it is not listed itself. Its body is replaced by a
        // single anchor pointing at the Location target, so that the link extraction below
        // resolves, filters and queues the target like any other link.
        $res = '<a href="'.$info['location'].'"> </a>';
    }

    // remove html comments so that links inside them are ignored
    $a_begin = 0;
    while (TRUE) {
        $a_begin = strpos($res, '<!--', $a_begin);
        if ($a_begin === FALSE) break; // no (further) comment start tag found
        $a_end = strpos($res, '-->', $a_begin + 3);
        if ($a_end === FALSE) break; // no comment end tag found, break
        $a_end += 3;
        $res = substr_replace($res, '', $a_begin, ($a_end - $a_begin));
    }

    // collect the href values of all anchors
    // contribution by vvkov, update by TK, 2005-07-27
    preg_match_all("/<[Aa][^>]*[Hh][Rr][Ee][Ff]=['\"]([^\"'>]+)[^>]*>/", $res, $urls);
    $urls_count = count($urls[1]);

    // a <base href="..."> is remembered for _absolute()
    // NOTE(review): the value is never reset here, so it stays in effect for pages
    // crawled later that have no <base> tag - confirm whether that is intended
    if (preg_match("/<[Bb][Aa][Ss][Ee][^>]*[Hh][Rr][Ee][Ff]=['\"]([^\"'>]+)[^>]*>/", $res, $matches)) {
        $this->base = $matches[1];
    }

    $result = array();
    $ts_begin = $this->microtime_float();
    // links are processed from the last one to the first one, for at most
    // PSNG_CRAWLER_MAX_GETFILE_TIME seconds
    while ((($this->microtime_float() - $ts_begin) < PSNG_CRAWLER_MAX_GETFILE_TIME) && $urls_count > 0) {
        // html attribute values carry "&" as "&amp;"
        $thisurl = trim(str_replace('&amp;', '&', $urls[1][--$urls_count]));
        if ($thisurl == '' || (strcasecmp(substr($thisurl, 0, strlen('javascript:')), 'javascript:') == 0)) continue;
        // filter out links to fragment ids (same resource) - added mk/2005-11-13
        if ('#' == $thisurl[0]) continue;

        $absUrl1 = $this->_absolute($thisurl, $url);
        $absUrl2 = $this->_removeForbiddenKeys($absUrl1);

        // collapse "//" inside the path; the "//" after the scheme and the query string stay untouched
        $start = (strpos($absUrl2, '//') + 3);
        if ($start > strlen($absUrl2)) {
            $start = strlen($absUrl2); // very short url: keep the offset inside the string
        }
        $end = strpos($absUrl2, '?', $start);
        if ($end === FALSE) $end = strlen($absUrl2);
        $absUrl = substr($absUrl2, 0, $start).str_replace('//', '/', substr($absUrl2, $start, ($end - $start))).substr($absUrl2, $end);

        if ($this->_isLocal($absUrl)) {
            $result[] = $absUrl;
        }
        // just break this loop when a timeout occurs
        if (($this->deadline != 0) && (($this->deadline - $this->microtime_float()) < 0)) {
            debug('', "global timeout");
            break;
        }
    }

    $result = array_unique($result);
    foreach ($result as $id => $file) {
        if (!in_array($file, $this->visitedUrls, TRUE) && !array_key_exists($file, $this->files)) {
            // check forbidden files
            if ($this->checkFileName($file)) continue;
            // check forbidden directories
            if ($this->checkDirectoryName($file)) continue;
            // add file to todo list
            array_push($this->todo, $file);
        } // else: file already in list
    }
    return TRUE;
}
/**
* Tells whether $givenURL belongs to the crawled area.
*
* An url is local when its host followed by its path starts with the crawler's
* host followed by the start path (see setDirectory()). ftp, mailto, news,
* javascript, telnet and callto links are never local.
*
* @param string $givenURL url to check
* @return bool TRUE when the url is local
*/
function _isLocal($givenURL) {
    if (preg_match(',^(ftp://|mailto:|news:|javascript:|telnet:|callto:),i', $givenURL)) return FALSE;
    $url = parse_url($givenURL);
    if (!is_array($url)) {
        return FALSE; // seriously malformed url, nothing to compare
    }
    // urls without host or path simply contribute an empty string
    $urlHost = isset($url['host']) ? $url['host'] : '';
    $urlPath = isset($url['path']) ? $url['path'] : '';
    $startDir = $this->host . $this->path;
    $currentDir = $urlHost . $urlPath;
    // prefix comparison: everything below the start directory is local
    return (strncmp($currentDir, $startDir, strlen($startDir)) == 0);
}
/**
* WAS: only allowed masking char: * (before and/or after search string)
*
* TODO check this with more data
*/
function checkFileName($filename) {
    // only the part after the last "/" is checked
    $baseName = substr($filename, strrpos($filename, '/') + 1);
    if (!is_array($this->forbidden_files) || count($this->forbidden_files) == 0) {
        return FALSE;
    }
    foreach ($this->forbidden_files as $pattern) {
        if ($pattern == '') {
            continue; // empty entries are ignored
        }
        // plain substring match (the former "*" wildcard handling is no longer active)
        if (strpos($baseName, $pattern) !== FALSE) {
            return TRUE;
        }
    }
    return FALSE;
}
/**
* Checks the directory part of $directory against the list of forbidden directories.
*
* The directory part is everything up to and including the last "/", so for a
* full url it still contains the scheme and the host. The check is a plain
* substring match; empty list entries are ignored.
*
* @param string $directory url or path to check
* @return bool TRUE when a forbidden directory string occurs in the directory part
*/
function checkDirectoryName($directory) {
    $directory = substr($directory, 0, strrpos($directory, '/') + 1); // with last "/"
    if (is_array($this->forbidden_dir) && count($this->forbidden_dir) > 0) {
        foreach ($this->forbidden_dir as $id => $dir) {
            if ($dir == '') continue;
            // (the former "*" wildcard handling and its debug output are no longer active)
            if (strpos($directory, $dir) !== FALSE) {
                return TRUE;
            }
        }
    }
    return FALSE;
}
/**
* Extracts the information the crawler needs from a list of response headers.
*
* Header names are matched case-insensitively. Cookies found in a Set-Cookie
* header are remembered in $this->cookies (first one wins per cookie name) and
* the cookie name is added to the forbidden url keys.
*
* @param array $header header name => value; the status line is expected under the key ''
* @return array with the keys http_status, lastmod, date, size and location
*               ('' when unknown), plus changefreq for "Pragma: no-cache"
*/
function _handleHeaders($header) {
    $res = array();
    $res['http_status'] = '';
    $res['lastmod'] = '';
    $res['date'] = '';
    $res['size'] = '';
    $res['location'] = '';
    if (is_array($header)) {
        foreach ($header as $key => $value) {
            $name = strtolower((string) $key);
            if ($name == '') {
                // status line, e.g. "HTTP/1.1 200 OK" => the status code is the second word
                if (strncmp($value, 'HTTP/', strlen('HTTP/')) == 0) {
                    $s = explode(" ", $value);
                    if (isset($s[1])) {
                        $res['http_status'] = $s[1];
                    }
                }
            } elseif ($name == 'last-modified') {
                $res['lastmod'] = strtotime(trim($value)); // no dynamic (php/other script) generated page
            } elseif ($name == 'date') {
                $res['date'] = strtotime(trim($value));
            } elseif ($name == 'content-length') {
                $res['size'] = trim($value);
            } elseif ($name == 'location') {
                $res['location'] = trim($value);
            } elseif ($name == 'set-cookie') {
                // "name=value; path=/; expires=...; secure": the first pair is the cookie
                // itself, everything after it is an attribute (stored with a lower-case name)
                $cookie_name = '';
                $cookie = array();
                $isFirst = TRUE;
                foreach (explode(";", trim($value)) as $part) {
                    $part = trim($part);
                    if ($part == '') continue;
                    // limit 2: a "=" inside the value belongs to the value
                    $p = explode('=', $part, 2);
                    $attribute = trim($p[0]);
                    $attributeValue = isset($p[1]) ? trim($p[1]) : ''; // flags like "secure" have no value
                    if ($isFirst) {
                        $cookie_name = $attribute;
                        $cookie[$attribute] = $attributeValue;
                        $isFirst = FALSE;
                    } else {
                        $cookie[strtolower($attribute)] = $attributeValue;
                    }
                }
                // add cookie if not already set
                if ($cookie_name != '' && !isset($this->cookies[$cookie_name])) {
                    $this->cookies[$cookie_name] = $cookie;
                    $this->forbiddenKeys[] = $cookie_name;
                }
            } elseif ($name == 'pragma') {
                $pragma = trim($value);
                if ($pragma == "no-cache") { // handle non-cached files -> normally dynamically created pages
                    if ($res['lastmod'] == '') $res['lastmod'] = $res['date'];
                    $res['changefreq'] = 'always';
                }
            }
        }
    }
    // without a Last-Modified header the Date header is the best guess
    if ($res['date'] != '' && $res['lastmod'] == '') $res['lastmod'] = $res['date'];
    return $res;
}
/**
* Cleans an url before it is queued: removes the fragment ("#..."), removes
* query parameters whose name is one of the forbidden keys, and strips
* trailing "&" and "?" characters.
*
* Parameters are compared by their exact name, so a forbidden key "sid"
* removes "sid=..." but leaves "side=..." or "x=sid" alone. All other
* parameters are kept unchanged and in their original order.
*
* @param string $url url to clean
* @return string the cleaned url
*/
function _removeForbiddenKeys($url) {
    // remove anchor links: beginning with # to the end of the url
    $fragmentStart = strpos($url, '#');
    if ($fragmentStart !== FALSE) {
        $url = substr($url, 0, $fragmentStart);
    }

    $paramsStart = strpos($url, '?');
    // only urls with parameters have to be searched for keys
    if ($paramsStart !== FALSE && is_array($this->forbiddenKeys) && count($this->forbiddenKeys) > 0) {
        $forbidden = array();
        foreach ($this->forbiddenKeys as $key) {
            if ((string) $key == '') continue; // empty key => ignore it
            $forbidden[] = (string) $key;
        }
        if (count($forbidden) > 0) {
            $kept = array();
            foreach (explode('&', substr($url, $paramsStart + 1)) as $pair) {
                $nameEnd = strpos($pair, '=');
                $name = ($nameEnd === FALSE) ? $pair : substr($pair, 0, $nameEnd);
                if ($name != '' && in_array($name, $forbidden, TRUE)) {
                    continue; // forbidden parameter: drop name and value
                }
                $kept[] = $pair;
            }
            $url = substr($url, 0, $paramsStart);
            if (count($kept) > 0) {
                $url .= '?'.implode('&', $kept);
            }
        }
    }

    // remove empty & and ? at the end of the url
    $url = rtrim($url, '&');
    $url = rtrim($url, '?');
    return $url;
}
/**
* Fetches $urlString with a hand-built HTTP/1.1 GET request over a socket.
*
* Cookies from $this->cookies are sent along when their path occurs in the url
* (or when they have no path). The body is not downloaded when Content-Length
* exceeds PSNG_CRAWLER_MAX_FILESIZE; reading stops when
* PSNG_CRAWLER_MAX_GETFILE_TIME or the global deadline is exceeded.
*
* @param string $urlString absolute url to fetch
* @return array 'header' => array(header name => value; lines without ":" such as
*               the status line end up under the key ''),
*               'content' => response body (may be empty or truncated)
*/
function _getURL($urlString) {
    $url = parse_url($urlString);
    if (!is_array($url)) {
        $url = array(); // malformed url: continue with empty parts, the connect below fails cleanly
    }
    $url_scheme = isset($url['scheme']) ? $url['scheme'] : '';
    $url_host = isset($url['host']) ? $url['host'] : '';
    $url_port = isset($url['port']) ? $url['port'] : '';
    $url_path = isset($url['path']) ? $url['path'] : '';
    if ($url_path == '') {
        $url_path = '/'; // the request line needs at least "/"
    }
    $url_path = str_replace(' ', '%20', $url_path); // replace spaces in url
    $url_query = isset($url['query']) ? $url['query'] : '';

    $cookie_string = '';
    if (is_array($this->cookies) && count($this->cookies) > 0) {
        foreach ($this->cookies as $cookie_name => $cookie) {
            // check path - dumb approach (only check if url contains cookie path)
            $cookie_path = isset($cookie['path']) ? $cookie['path'] : '';
            if ($cookie_path == '' || strpos($urlString, $cookie_path) !== FALSE) {
                $cookie_value = isset($cookie[$cookie_name]) ? $cookie[$cookie_name] : '';
                $cookie_string .= $cookie_name . '=' . $cookie_value . '; ';
            }
        }
        if (strlen($cookie_string) > 0) {
            $cookie_string = 'Cookie: ' . $cookie_string ."\r\n";
        }
    }

    if ($url_port == '') {
        if ($url_scheme == 'https') {
            $url_port = "443";
        } else {
            $url_port = "80";
        }
    }

    // https needs the ssl:// transport, a plain socket would talk clear text to the TLS port
    $target = ($url_scheme == 'https') ? 'ssl://'.$url_host : $url_host;
    $fp = fsockopen($target, $url_port, $errno, $errstr, $this->timeout);
    if ($fp === FALSE) {
        debug($errstr, 'Could not open connection for '.$urlString.' (host: '.$url_host.', port:'.$url_port.'), Errornumber: '.$errno);
        return array('header' => array(), 'content' => '');
    }

    // re-encode the values of the query string
    $query_encoded = '';
    if ($url_query != '') {
        $query_encoded = '?';
        foreach (explode('&', $url_query) as $id => $quer) {
            // limit 2: a "=" inside the value stays part of the value
            $v = explode('=', $quer, 2);
            if (isset($v[1]) && $v[1] != '') {
                $query_encoded .= $v[0].'='.rawurlencode($v[1]).'&';
            } else {
                $query_encoded .= $v[0].'&';
            }
        }
        $query_encoded = substr($query_encoded, 0, strlen($query_encoded) - 1);
        $query_encoded = str_replace('%2B','+', $query_encoded);
    }

    $get = "GET ".$url_path.$query_encoded." HTTP/1.1\r\n";
    $get .= "Host: ".$url_host."\r\n";
    $get .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; phpSitemapNG ".PSNG_VERSION.")\r\n";
    $get .= "Referer: ".$url_scheme.'://'.$url_host.$url_path."\r\n";
    $get .= $cookie_string;
    $get .= "Connection: close\r\n\r\n";
    debug(str_replace("\n", "<br>\n", $get), 'GET-Query');

    stream_set_blocking($fp, TRUE);
    fwrite($fp, $get);
    $ts_begin = $this->microtime_float();

    // source for chunk-decoding: http://www.phpforum.de/archiv_13065_fsockopen@end@chunked@geht@nicht_anzeigen.html
    // get headers: everything up to the first empty line
    $header = array();
    $currentHeader = '';
    while ( '' != ($line=trim(fgets($fp, 1024))) ) {
        if ( FALSE !== ($pos=strpos($line, ':')) ) {
            $currentHeader = substr($line, 0, $pos);
            $header[$currentHeader] = trim(substr($line, $pos+1));
        } else {
            // no ":" in the line: status line (stored under '') or continuation of the previous header
            @$header[$currentHeader] .= $line;
        }
    }

    // check for chunk encoding: $chunk holds the bytes left in the current chunk, -1 means "not chunked"
    if (isset($header['Transfer-Encoding']) && $header['Transfer-Encoding'] == 'chunked') {
        $chunk = hexdec(fgets($fp, 1024));
    } else {
        $chunk = -1;
    }

    // check file size
    if (isset($header['Content-Length']) && $header['Content-Length'] > PSNG_CRAWLER_MAX_FILESIZE) {
        info($header['Content-Length'], "File size ". $header['Content-Length'] . " of ".$urlString." exceeds file size limit of ".PSNG_CRAWLER_MAX_FILESIZE." byte!");
        fclose($fp);
        return array('header' => $header, 'content' => '');
    }

    // get content
    $res = '';
    while ($chunk != 0 && !feof($fp)) {
        if ($chunk > 0){
            $part = fread($fp, $chunk);
            $chunk -= strlen($part);
            $res .= $part;
            if ($chunk == 0){
                // a chunk is followed by an empty line and then by the size of the next chunk
                if (fgets($fp, 1024) != "\r\n") debug('Error in chunk-decoding');
                $chunk = hexdec(fgets($fp, 1024));
            }
        } else {
            $res .= fread($fp, 1024);
        }
        // handle local timeout for fetching file
        if (($this->microtime_float() - $ts_begin) > PSNG_CRAWLER_MAX_GETFILE_TIME) break;
        // handle global timeout:
        if (($this->deadline != 0) && (($this->deadline - $this->microtime_float()) < 0)) break;
    }
    fclose($fp);
    return array('header' => $header, 'content' => $res);
}
// based from: http://www.php-faq.de/q/q-regexp-links-absolut.html
/**
* Purpose: turn a link $relative found in the resource $absolute
* (which must be a fully-qualified URI) into another fully-qualified
* ("absolute") URI.
* The $absolute parameter is assumed to contain a valid URI *without*
* a fragment ID part: no checks are done; $relative can be any kind of
* link found in this resource.
*
* Modified by Marjolein Katsma to support links with only a fragment id
* or with only GET parameters.
*/
/* function _absolute($relative, $absolute) {
// Link ist schon absolut
if (preg_match(',^(https?://|ftp://|mailto:|news:|javascript:|telnet:|callto:),i', $relative))
{
// hostname is not the same (with/without www) than the one used in the link
if (substr($relative, 0, 4) == 'http')
{
$url = parse_url($relative);
if ($url['host'] != $this->host
&& (
(("www.".$url['host']) == $this->host)
&& $this->withWWW == TRUE
|| ($url['host'] == ("www.".$this->host))
&& $this->withWWW == FALSE
)
)
{
$r = $relative; # @@@ not used mk/2005-11-13
$relative = str_replace($url['host'], $this->host, $relative); // replace hostname that differs from local
}
// is pure hostname without path - so add a /
if (!isset($url['path']) || ($url['path'] == '' && substr($relative, -1) != '/'))
{
$relative .= '/';
}
}
return $relative;
}
// parse_url() nimmt die URL auseinander
// @@@ does not take into account that parse_url() may return FALSE on error! mk/2005-11-13
$url = parse_url($absolute);
// dirname() erkennt auf / endende URLs nicht
if ($url['path'] {(strlen($url['path'])- 1)} == '/')
$dir = substr($url['path'], 0, strlen($url['path']) - 1);
else
$dir = dirname($url['path']);
// absoluter Link auf dem gleichen Server
if ($relative{0} == '/') {
$relative = substr($relative, 1);
$dir = '';
}
// set it to default host // TK
/* - assumed $url['host'] is set - not necessarily true for all schemes! condition added
* - corrected tests for return value of strpos (result 0 is a match!!)
* mk/2005-11-13
* /
if (isset($url['host']))
{
if ($url['host'] != $this->host &&
(strpos($url['host'], $this->host) !== FALSE || strpos($this->host, $url['host']) !== FALSE))
{
$url['host'] = $this->host;
}
}
/* GET-parameter links: replace any existing GET
* parameters or append to (sanitized) $absolute
* mk/2005-11-13
* /
if ('?' == $relative{0})
{
// prepare for building new URL
$query = $relative;
echo 'Crawler _absolute: '.'query '.$query.'
';
}
/* fragment-id links: should be appended to (sanitized) $absolute
* mk/2005-11-13
* /
elseif ('#' == $relative{0})
{
// prepare for building new URL
$fragment = $relative;
echo 'Crawler _absolute: '.'fragment '.$fragment.'
';
}
// other relative link: build a new path from current directory/path and $relative
else
{
// dirname() erkennt auf / endende URLs nicht
// assumes $url['path'] is set - not necessarily true! condition added mk/2005-11-13
if (isset($url['path']))
{
if ('/' == substr($url['path'], -1))
{
$dir = substr($url['path'], 0, strlen($url['path']) - 1);
echo 'Crawler _absolute: '.'path '.$url['path'].' ends in / - dir: '.$dir.'
';
}
else
{
$dir = dirname($url['path']);
echo 'Crawler _absolute: '.'path '.$url['path'].' does NOT end in / - dir: '.$dir.'
';
}
}
else
{
$dir = '/'; # minimal dir to use in URL path
}
// absoluter Link auf dem gleichen Server == absolute link to same server/host
# @@@ mk/2005-11-13 no / between host and relative??
if ($relative{0} == '/') {
echo 'Crawler _absolute: '.'absolute link to '.$relative.'
';
$relative = substr($relative, 1);
$dir = '/';
} else {
// Link fängt mit ./ an
if (substr($relative, 0, 2) == './')
{
$relative = substr($relative, 2);
}
// Referenzen auf höher liegende Verzeichnisse auflösen
else
{
while (substr($relative, 0, 3) == '../') {
$relative = substr($relative, 3);
$dir = substr($dir, 0, strrpos($dir, '/'));
}
}
}
// now construct new path mk/2005-11-13
$path = $dir.$relative;
echo 'Crawler _absolute: '.'new path '.$path.'
';
}
// volle URL zurückgeben
// did not support all parts or a URL! - corrected mk/2005-11-13
$abs = ('file' == $url['scheme']) ? $url['scheme'].':///' : $url['scheme'].'://';
$abs .= (isset($url['user'])) ? $abs .= $url['user'].( (isset($url['pass'])) ? ':'.$url['pass'] : '' ).'@' : '';
$abs .= (isset($url['host'])) ? $url['host'] : '';
$abs .= (isset($url['port'])) ? ':'.$url['port'] : '';
$abs .= (isset($path)) ? $path : (isset($url['path']) ? $url['path'] : '/'); # maintain existing path if we didn't build a new one; make sure we have at least a '/'
$abs .= (isset($query)) ? $query : ''; # append specified query link
$abs .= (isset($fragment)) ? $fragment : ''; # append specified fragment link
//mecho 'Crawler _absolute: '.'new url '.$abs.'
';
return $abs;
}
*/
/**
* Turns the link $relative found in the resource $absolute into an absolute url.
*
* Links that already carry a scheme are returned as they are; for http(s)
* links the host is switched to the crawler's host when both differ only by a
* leading "www.", and a "/" is added when the link has no path. All other
* links are resolved against the directory of $absolute, or against
* $this->base when a base url is set.
*
* @param string $relative link as found in the page
* @param string $absolute fully-qualified url of the page that contains the link
* @return string the absolute url
*/
function _absolute($relative, $absolute) {
    // link is already absolute
    if (preg_match(',^(https?://|ftp://|mailto:|news:|javascript:|telnet:|callto:),i', $relative)) {
        if (strncasecmp($relative, 'http', 4) == 0) {
            $url = parse_url($relative);
            if (!is_array($url) || !isset($url['host'])) {
                return $relative; // cannot be parsed, leave it alone
            }
            // hostname is not the same (with/without www) than the one used in the link
            $linkHost = $url['host'];
            $differsOnlyByWww = ((("www.".$linkHost) == $this->host) && $this->withWWW == TRUE)
                || (($linkHost == ("www.".$this->host)) && $this->withWWW == FALSE);
            if ($linkHost != $this->host && $differsOnlyByWww) {
                $relative = str_replace($linkHost, $this->host, $relative); // replace hostname that differs from local
            }
            // is pure hostname without path - so add a / (in front of a query string or fragment, if any)
            if (!isset($url['path']) || $url['path'] == '') {
                $cut = strcspn($relative, '?#');
                $relative = substr($relative, 0, $cut).'/'.substr($relative, $cut);
            }
        }
        return $relative;
    }

    // parse_url() splits the url into its parts
    $url = parse_url($absolute);
    if (!is_array($url)) {
        $url = array();
    }
    // fall back to the crawler's own protocol/host when $absolute lacks them
    $scheme = isset($url['scheme']) ? $url['scheme'] : $this->protocol;
    $host = isset($url['host']) ? $url['host'] : $this->host;
    $path = (isset($url['path']) && $url['path'] != '') ? $url['path'] : '/';

    // dirname() does not recognise urls that end with /
    if (substr($path, -1) == '/') {
        $dir = substr($path, 0, strlen($path) - 1);
    } else {
        $dir = dirname($path);
    }
    // for a file in the top level directory dirname() yields "/" ("\" on windows) or "."
    if ($dir == '/' || $dir == '\\' || $dir == '.') {
        $dir = '';
    }

    // absolute link on the same server
    if (strlen($relative) > 0 && $relative[0] == '/') {
        $relative = substr($relative, 1);
        $dir = '';
    }

    // set it to default host // TK
    // (strpos() returns 0 for a match at the start, so the result is compared with !==)
    if ($host != $this->host && (strpos($host, $this->host) !== FALSE || strpos($this->host, $host) !== FALSE)) {
        $host = $this->host;
    }

    if (substr($relative, 0, 2) == './') {
        // link starts with ./
        $relative = substr($relative, 2);
    } else {
        // resolve references to parent directories
        while (substr($relative, 0, 3) == '../') {
            $relative = substr($relative, 3);
            $dir = substr($dir, 0, strrpos($dir, '/'));
        }
    }

    // if base is set, add it.
    if (strlen((string) $this->base) > 0) {
        return $this->base . urldecode($relative);
    }
    // return the full url
    return sprintf('%s://%s%s/%s', $scheme, $host, $dir, urldecode($relative));
}
/* better compare function: contains */
function _fl_contains($key, $array) {
    // anything but a non-empty array cannot contain a match
    if (!is_array($array) || count($array) == 0) {
        return FALSE;
    }
    // TRUE as soon as one of the array values occurs somewhere inside $key
    foreach ($array as $needle) {
        $found = @ strpos($key, $needle);
        if ($found !== FALSE) {
            return TRUE;
        }
    }
    return FALSE;
}
/**
* set list of forbidden directories
*/
function setForbiddenDirectories($directories = array ()) {
    // checkDirectoryName() matches urls against these strings
    $forbidden = $directories;
    $this->forbidden_dir = $forbidden;
}
/**
* set list of forbidden files
*/
function setForbiddenFiles($files = array ()) {
    // checkFileName() matches file names against these strings
    $forbidden = $files;
    $this->forbidden_files = $forbidden;
}
/**
* Replaces the list of forbidden keys that _removeForbiddenKeys() strips from urls.
* The given list is stored as it is (it is not merged with the current one).
*/
function setForbiddenKeys($keys) {
    $forbidden = $keys;
    $this->forbiddenKeys = $forbidden;
}
}
?>