Convert spider detection to a data structure and a loop
[infodrom.org/www.zeitungsliste.de] / lib / core.inc
index 0365114..b802c39 100644 (file)
@@ -122,102 +122,118 @@ function is_spider()
     if (isset($_SESSION['robot']))
       return $_SESSION['robot'];
 
     if (isset($_SESSION['robot']))
       return $_SESSION['robot'];
 
-  if (strpos($_SERVER['HTTP_USER_AGENT'], 'Yahoo! Slurp') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Googlebot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Mediapartners-Google') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'VoilaBot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Gigabot/3.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Speedy Spider') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'LinkWalker/2.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'proximic') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Yeti/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Eurobot/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'MnoGoSearch/') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'ia_archiver') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Seekbot/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'MyEngines-Bot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'larbin_') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'findlinks/1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'holmes/3.12') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'NoteworthyBot/0.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Eurosoft-Bot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Eurobot/1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Yandex') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'msnbot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Twiceler-0.9') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'ScoutJet') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'penthesilea/0.3') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'VisBot/2.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Netluchs/Nutch-1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Yanga WorldSearch Bot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Semager') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'DotBot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Baiduspider') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'MJ12bot/') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Yahoo-MMCrawler') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Charlotte/1.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Gaisbot/3.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'psbot/0.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Ask Jeeves/Teoma') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SapphireWebCrawler/Nutch') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'kalooga/KaloogaBot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SimilarPages/Nutch-1.0-dev') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Cogentbot/1.') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'CatchBot/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'GingerCrawler/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'yacybot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'www.yacy.net') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Nutch-1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Tagoobot/3.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SapphireWebCrawler/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'BotOnParade') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SurveyBot/2.3') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'XmarksFetch/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'spbot/2.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'TinEye/1.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'TurnitinBot/2.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'CligooRobot/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'libwww-perl') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Wget') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Python-urllib') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'CamontSpider/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'TwengaBot-Discover') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Hailoobot/1.2') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'bingbot/2.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Exabot/3.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'spbot/2.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Search17Bot/1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Linguee Bot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'findlinks/2') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SiteBot/0.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Purebot/1.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Ezooms/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'discobot/1.1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'AhrefsBot/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'suggybot v0.01a') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'DomainCrawler/2.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Plukkie/1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], '/Nutch-') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'aiHitBot/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'AhrefsBot/') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Crawler') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Acoon') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'aiHitBot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SeznamBot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'CareerBot/1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SiteExplorer/1') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'discoverybot/2.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'SemrushBot/') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'archive.org_bot') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'waybackarchive.org/1.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'spbot/4.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Mail.RU_Bot/2.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'linkdexbot/2.0') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'webmeup-crawler.com') !== false ||
-      strpos($_SERVER['HTTP_USER_AGENT'], 'Cityreview Robot') !== false) {
-    if (isset($_SESSION['uid']))
-      $_SESSION['robot'] = true;
-    return true;
+  $known_spiders = array('Yahoo! Slurp',
+                        'Googlebot',
+                        'Mediapartners-Google',
+                        'VoilaBot',
+                        'Gigabot/3.0',
+                        'Speedy Spider',
+                        'LinkWalker/2.0',
+                        'proximic',
+                        'Yeti/1.0',
+                        'Eurobot/1.0',
+                        'MnoGoSearch/',
+                        'ia_archiver',
+                        'Seekbot/1.0',
+                        'MyEngines-Bot',
+                        'larbin_',
+                        'findlinks/1',
+                        'holmes/3.12',
+                        'NoteworthyBot/0.1',
+                        'Eurosoft-Bot',
+                        'Eurobot/1',
+                        'Yandex',
+                        'msnbot',
+                        'Twiceler-0.9',
+                        'ScoutJet',
+                        'penthesilea/0.3',
+                        'VisBot/2.0',
+                        'Netluchs/Nutch-1.0',
+                        'Yanga WorldSearch Bot',
+                        'Semager',
+                        'DotBot',
+                        'Baiduspider',
+                        'MJ12bot/',
+                        'Yahoo-MMCrawler',
+                        'Charlotte/1.1',
+                        'Gaisbot/3.0',
+                        'psbot/0.1',
+                        'Ask Jeeves/Teoma',
+                        'SapphireWebCrawler/Nutch',
+                        'kalooga/KaloogaBot',
+                        'SimilarPages/Nutch-1.0-dev',
+                        'Cogentbot/1.',
+                        'CatchBot/1.0',
+                        'GingerCrawler/1.0',
+                        'yacybot',
+                        'www.yacy.net',
+                        'Nutch-1.0',
+                        'Tagoobot/3.0',
+                        'SapphireWebCrawler/1.0',
+                        'BotOnParade',
+                        'SurveyBot/2.3',
+                        'XmarksFetch/1.0',
+                        'spbot/',
+                        'TinEye/1.1',
+                        'TurnitinBot/2.1',
+                        'CligooRobot/1.0',
+                        'libwww-perl',
+                        'Wget',
+                        'Python-urllib',
+                        'CamontSpider/1.0',
+                        'TwengaBot-Discover',
+                        'Hailoobot/1.2',
+                        'bingbot/2.0',
+                        'Exabot/3.0',
+                        'spbot/2.1',
+                        'Search17Bot/1',
+                        'Linguee Bot',
+                        'findlinks/2',
+                        'SiteBot/0.1',
+                        'Purebot/1.1',
+                        'Ezooms/1.0',
+                        'discobot/1.1',
+                        'AhrefsBot/1.0',
+                        'suggybot v0.01a',
+                        'DomainCrawler/2.0',
+                        'Plukkie/1',
+                        '/Nutch-',
+                        'aiHitBot/1.0',
+                        'AhrefsBot/',
+                        'Crawler',
+                        'Acoon',
+                        'aiHitBot',
+                        'SeznamBot',
+                        'CareerBot/1',
+                        'SiteExplorer/1',
+                        'discoverybot/2.0',
+                        'SemrushBot/',
+                        'archive.org_bot',
+                        'waybackarchive.org/1.0',
+                        'spbot/4.0',
+                        'Mail.RU_Bot/2.0',
+                        'linkdexbot/2.0',
+                        'webmeup-crawler.com',
+                        'meanpathbot/',
+                        'SearchmetricsBot',
+                        'publiclibraryarchive.org/',
+                        'memoryBot/1',
+                        'Lipperhey SEO Service',
+                        'Sitedomain-Bot',
+                        'seoscanners.net/',
+                        'Qwantify/',
+                        'WBSearchBot/',
+                        'Seobility',
+                        'MetaJobBot',
+                        'BellPagesCA/1.0',
+                        'Cityreview Robot');
+
+  foreach ($known_spiders as $spider_id) {
+    if (strpos($_SERVER['HTTP_USER_AGENT'], $spider_id) !== false) {
+      if (isset($_SESSION['uid']))
+       $_SESSION['robot'] = true;
+      return true;
+    }
   }
 
   if (isset($_SESSION['uid']))
   }
 
   if (isset($_SESSION['uid']))