From 8d958522525d6640d175ee62240675e56ea4b4e2 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Sat, 27 May 2017 05:21:06 +0200
Subject: [PATCH] robots: throw an error 403 to bots ignoring robots.txt

---
Note: this copy was re-wrapped from a whitespace-collapsed patch (blank
context lines restored by inference) and the bot detection was amended to
use explicit strpos() !== false checks, so the commit and blob ids in the
headers no longer correspond to the content below.

 web/index.php  | 22 +++++++++++++++++++---
 web/robots.txt |  2 +-
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/web/index.php b/web/index.php
index ef8d5e8..6d7bd82 100644
--- a/web/index.php
+++ b/web/index.php
@@ -11,9 +11,25 @@ if(isset($_SERVER['HTTP_X_FORWARDED_FOR']))
 
 //$_SERVER['PATH_INFO'] = $_SERVER['REQUEST_URI'];
 
-if(0)
-{
-	debuglog("{$_SERVER['REMOTE_ADDR']}, {$_SERVER['REQUEST_URI']}");
+// Blacklist some search bots which ignore robots.txt (most, in fact).
+// strpos() yields 0 for a match at offset 0 and false for no match, so the
+// result is compared against false instead of being tested for truthiness.
+$isbot = false;
+$agent = arraySafeVal($_SERVER,'HTTP_USER_AGENT','');
+foreach (array('MJ12bot', 'DotBot', 'robot', 'AhrefsBot', 'YandexBot', 'Googlebot') as $botname) {
+	if (strpos($agent, $botname) !== false) {
+		$isbot = true;
+		break;
+	}
+}
+
+if ($isbot) {
+	$url = arraySafeVal($_SERVER,'REQUEST_URI','');
+	// Explorer pages answer with a 403; any other URL ends with an empty response.
+	// NOTE(review): this throw happens before the try block below - confirm it is handled.
+	if (strpos($url, "explorer") !== false)
+		throw new CHttpException(403,"You are not wanted on this server. see robots.txt");
+	die();
 }
 
 try
diff --git a/web/robots.txt b/web/robots.txt
index a82d96e..70c2374 100644
--- a/web/robots.txt
+++ b/web/robots.txt
@@ -1,2 +1,2 @@
 User-agent: *
-Disallow:
+Disallow: /