MDL-50891 useragent: Move web crawler checks to useragent class
author Andrew Nicols <andrew@nicols.co.uk>
Wed, 5 Aug 2015 07:20:23 +0000 (15:20 +0800)
committer Andrew Nicols <andrew@nicols.co.uk>
Fri, 28 Aug 2015 06:31:51 +0000 (14:31 +0800)
lib/classes/session/manager.php
lib/classes/useragent.php
lib/deprecatedlib.php
lib/setuplib.php
lib/tests/setuplib_test.php
lib/tests/useragent_test.php

index b1a5e16..d565a22 100644 (file)
@@ -377,7 +377,7 @@ class manager {
         $user = null;
 
         if (!empty($CFG->opentogoogle)) {
-            if (is_web_crawler()) {
+            if (\core_useragent::is_web_crawler()) {
                 $user = guest_user();
             }
             $referer = get_local_referer(false);
index 29f1a07..fcc3c6d 100644 (file)
@@ -71,7 +71,7 @@ class core_useragent {
         self::DEVICETYPE_DEFAULT,
         self::DEVICETYPE_LEGACY,
         self::DEVICETYPE_MOBILE,
-        self::DEVICETYPE_TABLET
+        self::DEVICETYPE_TABLET,
     );
 
     /**
@@ -201,6 +201,7 @@ class core_useragent {
 
     /**
      * Returns true if the user appears to be on a tablet.
+     *
      * @return int
      */
     protected function is_useragent_tablet() {
@@ -208,6 +209,16 @@ class core_useragent {
         return (preg_match($tabletregex, $this->useragent));
     }
 
+    /**
+     * Whether the user agent relates to a web crawler (any of the known crawler types).
+     * msnbot and Baiduspider are matched case-insensitively, as the legacy is_web_crawler() did via stripos().
+     * @return bool
+     */
+    protected function is_useragent_web_crawler() {
+        $regex = '/Googlebot|google\.com|Yahoo! Slurp|\[ZSEBOT\]|(?i:msnbot)|bingbot|BingPreview|Yandex|AltaVista|(?i:Baiduspider)|Teoma/';
+        return (preg_match($regex, $this->useragent) === 1);
+    }
+
     /**
      * Gets a list of known device types.
      *
@@ -926,4 +937,15 @@ class core_useragent {
         // This browser does not support json.
         return false;
     }
+
+    /**
+     * Reports whether the current client looks like a web crawler.
+     * The decision is delegated to the user agent check on the shared instance.
+     *
+     * @return bool
+     */
+    public static function is_web_crawler() {
+        // Cast so that callers always receive a strict boolean.
+        return (bool) self::instance()->is_useragent_web_crawler();
+    }
 }
index 8c85c9d..68b79dc 100644 (file)
@@ -1151,7 +1151,7 @@ function navmenu($course, $cm=NULL, $targetwindow='self') {
 /**
  * @deprecated please use calendar_event::create() instead.
  */
- function add_event($event) {
+function add_event($event) {
     throw new coding_exception('add_event() can not be used any more, please use calendar_event::create() instead.');
 }
 
@@ -2378,4 +2378,21 @@ function get_referer($stripquery = true) {
     } else {
         return '';
     }
-}
\ No newline at end of file
+}
+/**
+ * Checks if current user is a web crawler.
+ *
+ * This list can not be made complete, this is not a security
+ * restriction, we make the list only to help these sites
+ * especially when automatic guest login is disabled.
+ *
+ * If admin needs security they should enable forcelogin
+ * and disable guest access!!
+ *
+ * @return bool
+ * @deprecated since Moodle 3.0 use \core_useragent::is_web_crawler instead.
+ */
+function is_web_crawler() {
+    debugging('is_web_crawler() has been deprecated, please use \core_useragent::is_web_crawler() instead.', DEBUG_DEVELOPER);
+    return \core_useragent::is_web_crawler();
+}
index cf5df35..4b165b5 100644 (file)
@@ -1692,45 +1692,6 @@ function make_localcache_directory($directory, $exceptiononerror = true) {
     return make_writable_directory("$CFG->localcachedir/$directory", $exceptiononerror);
 }
 
-/**
- * Checks if current user is a web crawler.
- *
- * This list can not be made complete, this is not a security
- * restriction, we make the list only to help these sites
- * especially when automatic guest login is disabled.
- *
- * If admin needs security they should enable forcelogin
- * and disable guest access!!
- *
- * @return bool
- */
-function is_web_crawler() {
-    if (!empty($_SERVER['HTTP_USER_AGENT'])) {
-        if (strpos($_SERVER['HTTP_USER_AGENT'], 'Googlebot') !== false ) {
-            return true;
-        } else if (strpos($_SERVER['HTTP_USER_AGENT'], 'google.com') !== false ) { // Google
-            return true;
-        } else if (strpos($_SERVER['HTTP_USER_AGENT'], 'Yahoo! Slurp') !== false ) {  // Yahoo
-            return true;
-        } else if (strpos($_SERVER['HTTP_USER_AGENT'], '[ZSEBOT]') !== false ) {  // Zoomspider
-            return true;
-        } else if (stripos($_SERVER['HTTP_USER_AGENT'], 'msnbot') !== false ) {  // MSN Search
-            return true;
-        } else if (strpos($_SERVER['HTTP_USER_AGENT'], 'bingbot') !== false ) {  // Bing
-            return true;
-        } else if (strpos($_SERVER['HTTP_USER_AGENT'], 'Yandex') !== false ) {
-            return true;
-        } else if (strpos($_SERVER['HTTP_USER_AGENT'], 'AltaVista') !== false ) {
-            return true;
-        } else if (stripos($_SERVER['HTTP_USER_AGENT'], 'baiduspider') !== false ) {  // Baidu
-            return true;
-        } else if (strpos($_SERVER['HTTP_USER_AGENT'], 'Teoma') !== false ) {  // Ask.com
-            return true;
-        }
-    }
-    return false;
-}
-
 /**
  * This class solves the problem of how to initialise $OUTPUT.
  *
index bda221b..b46cd64 100644 (file)
@@ -73,53 +73,6 @@ class core_setuplib_testcase extends advanced_testcase {
                 get_docs_url('%%WWWROOT%%/lib/tests/setuplib_test.php'));
     }
 
-    public function test_is_web_crawler() {
-        $browsers = array(
-            'Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:18.0) Gecko/18.0 Firefox/18.0',
-            'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/412 (KHTML, like Gecko) Safari/412',
-            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.215 Safari/534.10',
-            'Opera/9.0 (Windows NT 5.1; U; en)',
-            'Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17 â€“Nexus',
-            'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5',
-        );
-        $crawlers = array(
-            // Google.
-            'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
-            'Googlebot/2.1 (+http://www.googlebot.com/bot.html)',
-            'Googlebot-Image/1.0',
-            // Yahoo.
-            'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
-            // Bing.
-            'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
-            'Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)',
-            // MSN.
-            'msnbot/2.1',
-            // Yandex.
-            'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
-            'Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)',
-            // AltaVista.
-            'AltaVista V2.0B crawler@evreka.com',
-            // ZoomSpider.
-            'ZoomSpider - wrensoft.com [ZSEBOT]',
-            // Baidu.
-            'Baiduspider+(+http://www.baidu.com/search/spider_jp.html)',
-            'Baiduspider+(+http://www.baidu.com/search/spider.htm)',
-            'BaiDuSpider',
-            // Ask.com.
-            'User-Agent: Mozilla/2.0 (compatible; Ask Jeeves/Teoma)',
-        );
-
-        foreach ($browsers as $agent) {
-            $_SERVER['HTTP_USER_AGENT'] = $agent;
-            $this->assertFalse(is_web_crawler());
-        }
-        foreach ($crawlers as $agent) {
-            $_SERVER['HTTP_USER_AGENT'] = $agent;
-            $this->assertTrue(is_web_crawler(), "$agent should be considered a search engine");
-        }
-    }
-
     /**
      * Test if get_exception_info() removes file system paths.
      */
index b4b230c..cb7ac3d 100644 (file)
@@ -1091,6 +1091,235 @@ class core_useragent_testcase extends basic_testcase {
                     'supports_svg'                  => false,
                ),
             ),
+
+            // Google web crawlers.
+            array(
+                'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+            array(
+                'Googlebot/2.1 (+http://www.googlebot.com/bot.html)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+            array(
+                'Googlebot-Image/1.0',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+
+            // Yahoo crawlers.
+            // See https://help.yahoo.com/kb/slurp-crawling-page-sln22600.html.
+            array(
+                'Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+
+            // Bing / MSN / AdIdx crawlers.
+            // See http://www.bing.com/webmaster/help/which-crawlers-does-bing-use-8c184ec0.
+            array(
+                'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+            array(
+                'Mozilla/5.0 (compatible; bingbot/2.0 +http://www.bing.com/bingbot.htm)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+            array(
+                'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
+                array(
+                    'is_web_crawler'                => true,
+                    'is_webkit'                     => true,
+                    'is_safari_ios'                 => true,
+                    'check_safari_ios_version'      => array(
+                        '527'                       => true,
+                    ),
+
+                    'versionclasses'                => array(
+                        'safari',
+                        'ios',
+                    ),
+
+                    'devicetype'                    => 'mobile',
+               ),
+            ),
+            array(
+                'Mozilla/5.0 (Windows Phone 8.1; ARM; Trident/7.0; Touch; rv:11.0; IEMobile/11.0; NOKIA; Lumia 530) like Gecko (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
+                array(
+                    'is_web_crawler'                => true,
+                    'is_ie'                         => true,
+                    'check_ie_version'              => array(
+                        '0'                         => true,
+                        '5.0'                       => true,
+                        '5.5'                       => true,
+                        '6.0'                       => true,
+                        '7.0'                       => true,
+                        '8.0'                       => true,
+                        '9.0'                       => true,
+                        '10'                        => true,
+                        '11'                        => true,
+                    ),
+                    'versionclasses'                => array(
+                        'ie',
+                        'ie11',
+                    ),
+                    'devicetype'                    => 'mobile',
+               ),
+            ),
+
+            array(
+                'msnbot/2.0b (+http://search.msn.com/msnbot.htm)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+            array(
+                'msnbot/2.1',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+            array(
+                'msnbot-media/1.1 (+http://search.msn.com/msnbot.htm)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+            array(
+                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534+ (KHTML, like Gecko) BingPreview/1.0b',
+                array(
+                    'is_web_crawler'                => true,
+                    'is_webkit'                     => true,
+                    'is_safari'                     => true,
+                    'check_safari_version'          => array(
+                        '1'                         => true,
+                        '312'                       => true,
+                        '500'                       => true,
+                    ),
+
+                    'versionclasses'                => array(
+                        'safari',
+                    ),
+               ),
+            ),
+            array(
+                'Mozilla/5.0 (Windows Phone 8.1; ARM; Trident/7.0; Touch; rv:11.0; IEMobile/11.0; NOKIA; Lumia 530) like Gecko BingPreview/1.0b',
+                array(
+                    'is_web_crawler'                => true,
+                    'is_ie'                         => true,
+                    'check_ie_version'              => array(
+                        '0'                         => true,
+                        '5.0'                       => true,
+                        '5.5'                       => true,
+                        '6.0'                       => true,
+                        '7.0'                       => true,
+                        '8.0'                       => true,
+                        '9.0'                       => true,
+                        '10'                        => true,
+                        '11'                        => true,
+                    ),
+                    'versionclasses'                => array(
+                        'ie',
+                        'ie11',
+                    ),
+                    'devicetype'                    => 'mobile',
+               ),
+            ),
+
+            // Yandex.
+            // See http://help.yandex.com/search/robots/agent.xml.
+            array(
+                'Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+            array(
+                'Mozilla/5.0 (compatible; YandexImages/3.0; +http://yandex.com/bots)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+
+            // AltaVista.
+            array(
+                'AltaVista V2.0B crawler@evreka.com',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+
+            // ZoomSpider.
+            array(
+                'ZoomSpider - wrensoft.com [ZSEBOT]',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+
+            // Baidu.
+            array(
+                'Baiduspider+(+http://www.baidu.com/search/spider_jp.html)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+            array(
+                'Baiduspider+(+http://www.baidu.com/search/spider.htm)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
+
+            // Ask.com.
+            array(
+                'User-Agent: Mozilla/2.0 (compatible; Ask Jeeves/Teoma)',
+                array(
+                    'is_web_crawler'                => true,
+                    'versionclasses'                => array(
+                    ),
+               ),
+            ),
         );
     }
 
@@ -1525,4 +1754,15 @@ class core_useragent_testcase extends basic_testcase {
         }
         $this->assertCount(count($tests['versionclasses']), $actual);
     }
+
+    /**
+     * Checks crawler detection against each data set; a set without an 'is_web_crawler' key expects false.
+     *
+     * @dataProvider user_agents_providers
+     */
+    public function test_useragent_web_crawler($useragent, $tests) {
+        // Set up the core_useragent instance for the agent string under test.
+        core_useragent::instance(true, $useragent);
+
+        // Default to "not a crawler" unless the data set states otherwise.
+        $expected = false;
+        if (isset($tests['is_web_crawler'])) {
+            $expected = $tests['is_web_crawler'];
+        }
+        $this->assertSame($expected, core_useragent::is_web_crawler());
+    }
 }