Blame view

sm/spider.func.php 2.08 KB
42868d70   andryeyev   Создал GIT
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
  <?php
  // PHP XML sitemap generator: crawls the site from the configured base URL and writes the sitemap file.
  
  
  /**
   * Crawl the site starting at $CONFIG["baseurl"] and write the XML sitemap.
   *
   * Pending URLs are kept in $SPIDER["temp"]. They are taken off the queue in
   * batches and passed to multiGetURL(), which (via getAnchors()/handleHref())
   * may push newly discovered links back onto the queue. Once the queue is
   * empty, the output of genXmlSitemap() is written to $CONFIG["sitemap_file"].
   *
   * @param int $batchSize Maximum number of URLs passed to multiGetURL() per
   *                       batch; values below 1 are treated as 1.
   * @return void
   */
  function spider($batchSize = 4) {
          global $CONFIG, $SPIDER;

          $batchSize = max(1, (int) $batchSize);

          // Seed the queue with the start page and remember the crawl root.
          $SPIDER["temp"][0] = $CONFIG["baseurl"];
          $SPIDER["baseurl"] = $CONFIG["baseurl"];

          // urlInSpider() searches this list, so it must be an array before
          // the first page is parsed.
          if (!isset($SPIDER["spider"])) {
                  $SPIDER["spider"] = array();
          }

          while (count($SPIDER["temp"]) > 0) {
                  // Start every batch empty: reusing the previous batch would
                  // fetch (and record) the same URLs again on every pass.
                  $urls = array();
                  // Count the batch itself rather than comparing an index with
                  // the shrinking queue, so a full batch is really $batchSize URLs.
                  while (count($urls) < $batchSize && count($SPIDER["temp"]) > 0) {
                          $urls[] = array_pop($SPIDER["temp"]);
                  }
                  multiGetURL($urls);
          }

          $fp = fopen($CONFIG["sitemap_file"], "w+");
          if ($fp === false) {
                  // Nothing can be written; report it instead of passing false to fputs().
                  trigger_error("spider(): cannot open sitemap file '".$CONFIG["sitemap_file"]."' for writing", E_USER_WARNING);
                  return;
          }
          $xml_sitemap = genXmlSitemap();
          fputs($fp, $xml_sitemap);
          fclose($fp);
  }
  
  /**
   * Queue a link for crawling when it is an absolute http:// URL that is
   * internal to the crawled site and has not been seen or queued yet.
   *
   * @param string $html Source of the page the link was found on (not used here).
   * @param string $href Link target to examine.
   * @param string $url  URL of the page the link was found on (not used here).
   * @return false|null false for javascript: links, null in every other case
   *                    (whether or not the link was queued).
   */
  function handleHref($html, $href, $url) {
          global $SPIDER;

          if (!is_string($href)) {
                  return null;
          }

          $url_info = parse_url($href);
          // parse_url() returns false for seriously malformed URLs and leaves
          // out "scheme" for relative links; neither can be queued.
          if (!is_array($url_info) || !isset($url_info["scheme"])) {
                  return null;
          }

          if ($url_info["scheme"] == "javascript") {
                  return false;
          }
          if ($url_info["scheme"] != "http") {
                  return null;
          }

          // Same check order as before: already crawled, external, already queued.
          if (urlInSpider($href) || isLinkExternal($href, $SPIDER["baseurl"]) || urlInTemp($href)) {
                  return null;
          }

          $SPIDER["temp"][] = $href;
          return null;
  }
  
  /**
   * Pass the href of every anchor tag found in a page to handleHref().
   *
   * Each tag returned by getTags($html, '<a', '>') has its "href=" field read
   * with getTagField(), passed through correctUrl() together with
   * $SPIDER["baseurl"], and the result is given to handleHref().
   *
   * @param string $url  URL of the page being processed.
   * @param string $html Source of that page.
   * @return void
   */
  function getAnchors($url, $html) {
          global $SPIDER;

          $tags = getTags($html, '<a', '>');
          $idx = 0;
          while ($idx < sizeof($tags)) {
                  $target = correctUrl(getTagField($tags[$idx], "href="), $SPIDER["baseurl"]);
                  handleHref($html, $target, $url);
                  $idx++;
          }
  }
  
  /**
   * Fetch a batch of URLs and process every page that came back non-empty.
   *
   * All URLs are fetched through curlMultiGetPage(); the result at index i is
   * treated as the page for $urls[i]. A progress line is echoed for each URL.
   * URLs whose page is not empty are appended to $SPIDER["spider"] and their
   * source is passed to getAnchors().
   *
   * @param array $urls URLs to fetch, indexed from 0.
   * @return void
   */
  function multiGetURL($urls) {
          global $SPIDER;

          $pages = curlMultiGetPage($urls);
          $total = sizeof($urls);
          for ($n = 0; $n < $total; $n++) {
                  echo "Checking ".$urls[$n]." ...\n";
                  if ($pages[$n] == "") {
                          // Nothing was returned for this URL; do not record it.
                          continue;
                  }
                  $SPIDER["spider"][] = $urls[$n];
                  getAnchors($urls[$n], $pages[$n]);
          }
  }
  
  /**
   * Tell whether a URL is already in the list of crawled pages.
   *
   * @param string $url URL to look up.
   * @return bool true when $url is in $SPIDER["spider"]; false otherwise,
   *              including when that list has not been created yet.
   */
  function urlInSpider($url) {
          global $SPIDER;

          // The list only exists once a page has been recorded; searching a
          // missing list would make in_array() fail.
          if (!isset($SPIDER["spider"]) || !is_array($SPIDER["spider"])) {
                  return false;
          }
          // Strict comparison: URLs are compared as exact strings.
          return in_array($url, $SPIDER["spider"], true);
  }
  
  /**
   * Tell whether a URL is already waiting in the crawl queue.
   *
   * @param string $url URL to look up.
   * @return bool true when $url is in $SPIDER["temp"]; false otherwise,
   *              including when the queue has not been created yet.
   */
  function urlInTemp($url) {
          global $SPIDER;

          // Searching a missing queue would make in_array() fail.
          if (!isset($SPIDER["temp"]) || !is_array($SPIDER["temp"])) {
                  return false;
          }
          // Strict comparison: URLs are compared as exact strings.
          return in_array($url, $SPIDER["temp"], true);
  }
  
  ?>