From: Joey Schulze Date: Mon, 14 Mar 2016 08:12:48 +0000 (+0000) Subject: Convert spider detection to datastructure and loop X-Git-Url: https://git.infodrom.org/?p=infodrom.org%2Fwww.zeitungsliste.de;a=commitdiff_plain;h=a6c8f0acaea6fee2a140218d75a8478950cfd584;hp=89c4a80bd9600c3f1fdd068c512f90ba04b9a36b Convert spider detection to datastructure and loop --- diff --git a/lib/core.inc b/lib/core.inc index 0365114..b802c39 100644 --- a/lib/core.inc +++ b/lib/core.inc @@ -122,102 +122,118 @@ function is_spider() if (isset($_SESSION['robot'])) return $_SESSION['robot']; - if (strpos($_SERVER['HTTP_USER_AGENT'], 'Yahoo! Slurp') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Googlebot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Mediapartners-Google') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'VoilaBot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Gigabot/3.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Speedy Spider') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'LinkWalker/2.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'proximic') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Yeti/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Eurobot/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'MnoGoSearch/') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'ia_archiver') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Seekbot/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'MyEngines-Bot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'larbin_') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'findlinks/1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'holmes/3.12') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'NoteworthyBot/0.1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Eurosoft-Bot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Eurobot/1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Yandex') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'msnbot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Twiceler-0.9') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'ScoutJet') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'penthesilea/0.3') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'VisBot/2.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Netluchs/Nutch-1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Yanga WorldSearch Bot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Semager') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'DotBot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Baiduspider') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'MJ12bot/') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Yahoo-MMCrawler') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Charlotte/1.1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Gaisbot/3.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'psbot/0.1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Ask Jeeves/Teoma') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'SapphireWebCrawler/Nutch') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'kalooga/KaloogaBot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'SimilarPages/Nutch-1.0-dev') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Cogentbot/1.') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'CatchBot/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'GingerCrawler/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'yacybot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'www.yacy.net') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Nutch-1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Tagoobot/3.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'SapphireWebCrawler/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'BotOnParade') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'SurveyBot/2.3') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'XmarksFetch/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'spbot/2.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'TinEye/1.1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'TurnitinBot/2.1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'CligooRobot/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'libwww-perl') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Wget') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Python-urllib') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'CamontSpider/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'TwengaBot-Discover') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Hailoobot/1.2') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'bingbot/2.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Exabot/3.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'spbot/2.1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Search17Bot/1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Linguee Bot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'findlinks/2') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'SiteBot/0.1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Purebot/1.1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Ezooms/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'discobot/1.1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'AhrefsBot/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'suggybot v0.01a') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'DomainCrawler/2.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Plukkie/1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], '/Nutch-') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'aiHitBot/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'AhrefsBot/') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Crawler') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Acoon') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'aiHitBot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'SeznamBot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'CareerBot/1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'SiteExplorer/1') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'discoverybot/2.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'SemrushBot/') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'archive.org_bot') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'waybackarchive.org/1.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'spbot/4.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Mail.RU_Bot/2.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'linkdexbot/2.0') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'webmeup-crawler.com') !== false || - strpos($_SERVER['HTTP_USER_AGENT'], 'Cityreview Robot') !== false) { - if (isset($_SESSION['uid'])) - $_SESSION['robot'] = true; - return true; + $known_spiders = array('Yahoo! Slurp', + 'Googlebot', + 'Mediapartners-Google', + 'VoilaBot', + 'Gigabot/3.0', + 'Speedy Spider', + 'LinkWalker/2.0', + 'proximic', + 'Yeti/1.0', + 'Eurobot/1.0', + 'MnoGoSearch/', + 'ia_archiver', + 'Seekbot/1.0', + 'MyEngines-Bot', + 'larbin_', + 'findlinks/1', + 'holmes/3.12', + 'NoteworthyBot/0.1', + 'Eurosoft-Bot', + 'Eurobot/1', + 'Yandex', + 'msnbot', + 'Twiceler-0.9', + 'ScoutJet', + 'penthesilea/0.3', + 'VisBot/2.0', + 'Netluchs/Nutch-1.0', + 'Yanga WorldSearch Bot', + 'Semager', + 'DotBot', + 'Baiduspider', + 'MJ12bot/', + 'Yahoo-MMCrawler', + 'Charlotte/1.1', + 'Gaisbot/3.0', + 'psbot/0.1', + 'Ask Jeeves/Teoma', + 'SapphireWebCrawler/Nutch', + 'kalooga/KaloogaBot', + 'SimilarPages/Nutch-1.0-dev', + 'Cogentbot/1.', + 'CatchBot/1.0', + 'GingerCrawler/1.0', + 'yacybot', + 'www.yacy.net', + 'Nutch-1.0', + 'Tagoobot/3.0', + 'SapphireWebCrawler/1.0', + 'BotOnParade', + 'SurveyBot/2.3', + 'XmarksFetch/1.0', + 'spbot/', + 'TinEye/1.1', + 'TurnitinBot/2.1', + 'CligooRobot/1.0', + 'libwww-perl', + 'Wget', + 'Python-urllib', + 'CamontSpider/1.0', + 'TwengaBot-Discover', + 'Hailoobot/1.2', + 'bingbot/2.0', + 'Exabot/3.0', + 'spbot/2.1', + 'Search17Bot/1', + 'Linguee Bot', + 'findlinks/2', + 'SiteBot/0.1', + 'Purebot/1.1', + 'Ezooms/1.0', + 'discobot/1.1', + 'AhrefsBot/1.0', + 'suggybot v0.01a', + 'DomainCrawler/2.0', + 'Plukkie/1', + '/Nutch-', + 'aiHitBot/1.0', + 'AhrefsBot/', + 'Crawler', + 'Acoon', + 'aiHitBot', + 'SeznamBot', + 'CareerBot/1', + 'SiteExplorer/1', + 'discoverybot/2.0', + 'SemrushBot/', + 'archive.org_bot', + 'waybackarchive.org/1.0', + 'spbot/4.0', + 'Mail.RU_Bot/2.0', + 'linkdexbot/2.0', + 'webmeup-crawler.com', + 'meanpathbot/', + 'SearchmetricsBot', + 'publiclibraryarchive.org/', + 'memoryBot/1', + 'Lipperhey SEO Service', + 'Sitedomain-Bot', + 'seoscanners.net/', + 'Qwantify/', + 'WBSearchBot/', + 'Seobility', + 'MetaJobBot', + 'BellPagesCA/1.0', + 'Cityreview Robot'); + + foreach ($known_spiders as $spider_id) { + if (strpos($_SERVER['HTTP_USER_AGENT'], $spider_id) !== false) { + if (isset($_SESSION['uid'])) + $_SESSION['robot'] = true; + return true; + } } if (isset($_SESSION['uid']))