Update list of crawlers
[infodrom.org/www.zeitungsliste.de] / lib / core.inc
index 1242912..6d6be04 100644 (file)
@@ -72,7 +72,6 @@ function string_sanitise($text)
 function basepath()
 {
   global $cfg;
-  global $_SERVER;
 
   $pos = strpos(strtolower($cfg['home']), strtolower($_SERVER['SERVER_NAME']));
   $base = substr($cfg['home'], $pos+strlen($_SERVER['SERVER_NAME']));
@@ -95,15 +94,11 @@ function basepath()
 
 function logged_in()
 {
-  global $_SESSION;
-
   return isset($_SESSION['uid']);
 }
 
 function javascript_ok()
 {
-  global $_SESSION;
-
   if (!logged_in())
     return true;
 
@@ -115,67 +110,138 @@ function javascript_ok()
 
 function is_spider()
 {
-  global $_SERVER;
-  global $_SESSION;
-
   if (isset($_SESSION['uid']))
     if (isset($_SESSION['robot']))
       return $_SESSION['robot'];
 
-  if (strpos($_SERVER['HTTP_USER_AGENT'], 'Yahoo! Slurp') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Googlebot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Mediapartners-Google') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'VoilaBot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Gigabot/3.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Speedy Spider') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'LinkWalker/2.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'proximic') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Yeti/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Eurobot/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'MnoGoSearch/') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'ia_archiver') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Seekbot/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'MyEngines-Bot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'larbin_') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'findlinks/1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'holmes/3.12') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'NoteworthyBot/0.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Eurosoft-Bot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Eurobot/1.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Yandex') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'msnbot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Twiceler-0.9') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'ScoutJet') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'penthesilea/0.3') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'VisBot/2.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Netluchs/Nutch-1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Yanga WorldSearch Bot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Semager') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'DotBot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Baiduspider') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'MJ12bot/') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Yahoo-MMCrawler') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Charlotte/1.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Gaisbot/3.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'psbot/0.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Ask Jeeves/Teoma') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SapphireWebCrawler/Nutch') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'kalooga/KaloogaBot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SimilarPages/Nutch-1.0-dev') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Cogentbot/1.') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'CatchBot/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'GingerCrawler/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'yacybot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'www.yacy.net') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Nutch-1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Tagoobot/3.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SapphireWebCrawler/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'BotOnParade') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SurveyBot/2.3') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Cityreview Robot') !== false) {
-    if (isset($_SESSION['uid']))
-      $_SESSION['robot'] = true;
-    return true;
+  $known_spiders = array('Yahoo! Slurp',
+                        'Googlebot',
+                        'Mediapartners-Google',
+                        'VoilaBot',
+                        'Gigabot/3.0',
+                        'Speedy Spider',
+                        'LinkWalker/2.0',
+                        'proximic',
+                        'Yeti/1.0',
+                        'Eurobot/1.0',
+                        'MnoGoSearch/',
+                        'ia_archiver',
+                        'Seekbot/1.0',
+                        'MyEngines-Bot',
+                        'larbin_',
+                        'findlinks/1',
+                        'holmes/3.12',
+                        'NoteworthyBot/0.1',
+                        'Eurosoft-Bot',
+                        'Eurobot/1',
+                        'Yandex',
+                        'msnbot',
+                        'Twiceler-0.9',
+                        'ScoutJet',
+                        'penthesilea/0.3',
+                        'VisBot/2.0',
+                        'Netluchs/Nutch-1.0',
+                        'Yanga WorldSearch Bot',
+                        'Semager',
+                        'DotBot',
+                        'Baiduspider',
+                        'MJ12bot/',
+                        'Yahoo-MMCrawler',
+                        'Charlotte/1.1',
+                        'Gaisbot/3.0',
+                        'psbot/0.1',
+                        'Ask Jeeves/Teoma',
+                        'SapphireWebCrawler/Nutch',
+                        'kalooga/KaloogaBot',
+                        'SimilarPages/Nutch-1.0-dev',
+                        'Cogentbot/1.',
+                        'CatchBot/1.0',
+                        'GingerCrawler/1.0',
+                        'yacybot',
+                        'www.yacy.net',
+                        'Nutch-1.0',
+                        'Tagoobot/3.0',
+                        'SapphireWebCrawler/1.0',
+                        'BotOnParade',
+                        'SurveyBot/2.3',
+                        'XmarksFetch/1.0',
+                        'spbot/',
+                        'TinEye/1.1',
+                        'TurnitinBot/2.1',
+                        'CligooRobot/1.0',
+                        'libwww-perl',
+                        'Wget',
+                        'Python-urllib',
+                        'CamontSpider/1.0',
+                        'TwengaBot-Discover',
+                        'Hailoobot/1.2',
+                        'bingbot/2.0',
+                        'Exabot/3.0',
+                        'spbot/2.1',
+                        'Search17Bot/1',
+                        'Linguee Bot',
+                        'findlinks/2',
+                        'SiteBot/0.1',
+                        'Purebot/1.1',
+                        'Ezooms/1.0',
+                        'discobot/1.1',
+                        'AhrefsBot/1.0',
+                        'suggybot v0.01a',
+                        'DomainCrawler/2.0',
+                        'Plukkie/1',
+                        '/Nutch-',
+                        'aiHitBot/1.0',
+                        'AhrefsBot/',
+                        'Crawler',
+                        'Acoon',
+                        'aiHitBot',
+                        'SeznamBot',
+                        'CareerBot/1',
+                        'SiteExplorer/1',
+                        'discoverybot/2.0',
+                        'SemrushBot/',
+                        'archive.org_bot',
+                        'waybackarchive.org/1.0',
+                        'spbot/4.0',
+                        'Mail.RU_Bot/2.0',
+                        'linkdexbot/2.0',
+                        'webmeup-crawler.com',
+                        'meanpathbot/',
+                        'SearchmetricsBot',
+                        'publiclibraryarchive.org/',
+                        'memoryBot/1',
+                        'Lipperhey SEO Service',
+                        'Sitedomain-Bot',
+                        'seoscanners.net/',
+                        'Qwantify/',
+                        'WBSearchBot/',
+                        'Seobility',
+                        'MetaJobBot',
+                        'BellPagesCA/1.0',
+                        'SafeDNSBot',
+                        'Uptimebot/1.0',
+                        'DomainStatsBot/1.0',
+                        'MetaJobBot',
+                        'SEOkicks-Robot',
+                        'Cliqzbot',
+                        'bingbot/3',
+                        'SEOkicks-Robot',
+                        'MojeekBot/',
+                        'oBot/',
+                        'CCBot/',
+                        'TurnitinBot',
+                        'ips-agent',
+                        'ExtLinksBot/',
+                        'Barkrowler/',
+                        'adscanner/',
+                        'Cityreview Robot');
+
+  $user_agent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : ''; // header is optional
+  foreach ($known_spiders as $spider_id) {
+    if (strpos($user_agent, $spider_id) !== false) {
+      if (isset($_SESSION['uid'])) $_SESSION['robot'] = true;
+      return true;
+    }
   }
 
   if (isset($_SESSION['uid']))
@@ -185,7 +251,7 @@ function is_spider()
 
 function is_admin()
 {
-  global $_SESSION;
+  if (!isset($_SESSION)) return false; // isset() stays silent when no session was started
 
   if (!array_key_exists('nickname', $_SESSION))
     return false;
@@ -196,7 +262,6 @@ function is_admin()
 
 function format_info_bookmarks()
 {
-  global $_SESSION;
   global $cfg;
 
   if (!isset($_SESSION['uid']))
@@ -223,5 +288,3 @@ function format_info_bookmarks()
 
   return $ret;
 }
-
-?>
\ No newline at end of file