MDL-38170 SimplePie: Cannot read https feeds through proxy
authorsam marshall <s.marshall@open.ac.uk>
Fri, 22 Feb 2013 17:23:01 +0000 (17:23 +0000)
committersam marshall <s.marshall@open.ac.uk>
Fri, 1 Mar 2013 10:35:31 +0000 (10:35 +0000)
lib/filelib.php
lib/simplepie/moodle_simplepie.php
lib/tests/filelib_test.php

index 77cf104..49ee9ce 100644 (file)
@@ -3369,6 +3369,45 @@ class curl {
     public function get_errno() {
         return $this->errno;
     }
+
+    /**
+     * When using a proxy, an additional HTTP response code may appear at
+     * the start of the header. For example, when using https over a proxy
+     * there may be 'HTTP/1.0 200 Connection Established'. Other codes are
+     * also possible and some may come with their own headers.
+     *
+     * If using the return value containing all headers, this function can be
+     * called to remove unwanted doubles.
+     *
+     * Note that it is not possible to distinguish this situation from valid
+     * data unless you know the actual response part (below the headers)
+     * will not be included in this string, or else will not 'look like' HTTP
+     * headers. As a result it is not safe to call this function for general
+     * data.
+     *
+     * @param string $input Input HTTP response
+     * @return string HTTP response with additional headers stripped if any
+     */
+    public static function strip_double_headers($input) {
+        // I have tried to make this regular expression as specific as possible
+        // to avoid any case where it does weird stuff if you happen to put
+        // HTTP/1.1 200 at the start of any line in your RSS file. This should
+        // also make it faster because it can abandon regex processing as soon
+        // as it hits something that doesn't look like an http header. The
+        // header definition is taken from RFC 822, except I didn't support
+        // folding which is never used in practice.
+        $crlf = "\r\n";
+        return preg_replace(
+                // HTTP version and status code (ignore value of code).
+                '~^HTTP/1\..*' . $crlf .
+                // Header name: character between 33 and 126 decimal, except colon.
+                // Colon. Header value: any character except \r and \n. CRLF.
+                '(?:[\x21-\x39\x3b-\x7e]+:[^' . $crlf . ']+' . $crlf . ')*' .
+                // Headers are terminated by another CRLF (blank line).
+                $crlf .
+                // Second HTTP status code, this time must be 200.
+                '(HTTP/1.[01] 200 )~', '$1', $input);
+    }
 }
 
 /**
index 54e2d07..5fe2755 100644 (file)
@@ -139,7 +139,7 @@ class moodle_simplepie_file extends SimplePie_File
             }
         }
 
-        $this->headers = $curl->get($url);
+        $this->headers = curl::strip_double_headers($curl->get($url));
 
         if ($curl->error) {
             $this->error = 'cURL Error: '.$curl->error;
index 53bc317..7365a44 100644 (file)
@@ -270,4 +270,101 @@ class filelib_testcase extends advanced_testcase {
         $this->assertEquals($contenthash, $fileref->get_contenthash());
         $this->assertEquals($filecontent, $fileref->get_content());
     }
+
+    /**
+     * Tests the strip_double_headers function in the curl class.
+     */
+    public function test_curl_strip_double_headers() {
+        // Example from issue tracker.
+        $mdl30648example = <<<EOF
+HTTP/1.0 407 Proxy Authentication Required
+Server: squid/2.7.STABLE9
+Date: Thu, 08 Dec 2011 14:44:33 GMT
+Content-Type: text/html
+Content-Length: 1275
+X-Squid-Error: ERR_CACHE_ACCESS_DENIED 0
+Proxy-Authenticate: Basic realm="Squid proxy-caching web server"
+X-Cache: MISS from homer.lancs.ac.uk
+X-Cache-Lookup: NONE from homer.lancs.ac.uk:3128
+Via: 1.0 homer.lancs.ac.uk:3128 (squid/2.7.STABLE9)
+Connection: close
+
+HTTP/1.0 200 OK
+Server: Apache
+X-Lb-Nocache: true
+Cache-Control: private, max-age=15
+ETag: "4d69af5d8ba873ea9192c489e151bd7b"
+Content-Type: text/html
+Date: Thu, 08 Dec 2011 14:44:53 GMT
+Set-Cookie: BBC-UID=c4de2e109c8df6a51de627cee11b214bd4fb6054a030222488317afb31b343360MoodleBot/1.0; expires=Mon, 07-Dec-15 14:44:53 GMT; path=/; domain=bbc.co.uk
+X-Cache-Action: MISS
+X-Cache-Age: 0
+Vary: Cookie,X-Country,X-Ip-is-uk-combined,X-Ip-is-advertise-combined,X-Ip_is_uk_combined,X-Ip_is_advertise_combined, X-GeoIP
+X-Cache: MISS from ww
+
+<html>...
+EOF;
+        $mdl30648expected = <<<EOF
+HTTP/1.0 200 OK
+Server: Apache
+X-Lb-Nocache: true
+Cache-Control: private, max-age=15
+ETag: "4d69af5d8ba873ea9192c489e151bd7b"
+Content-Type: text/html
+Date: Thu, 08 Dec 2011 14:44:53 GMT
+Set-Cookie: BBC-UID=c4de2e109c8df6a51de627cee11b214bd4fb6054a030222488317afb31b343360MoodleBot/1.0; expires=Mon, 07-Dec-15 14:44:53 GMT; path=/; domain=bbc.co.uk
+X-Cache-Action: MISS
+X-Cache-Age: 0
+Vary: Cookie,X-Country,X-Ip-is-uk-combined,X-Ip-is-advertise-combined,X-Ip_is_uk_combined,X-Ip_is_advertise_combined, X-GeoIP
+X-Cache: MISS from ww
+
+<html>...
+EOF;
+        // For HTTP, replace the \n with \r\n.
+        $mdl30648example = preg_replace("~(?!<\r)\n~", "\r\n", $mdl30648example);
+        $mdl30648expected = preg_replace("~(?!<\r)\n~", "\r\n", $mdl30648expected);
+
+        // Test stripping works OK.
+        $this->assertEquals($mdl30648expected, curl::strip_double_headers($mdl30648example));
+        // Test it does nothing to the 'plain' data.
+        $this->assertEquals($mdl30648expected, curl::strip_double_headers($mdl30648expected));
+
+        // Example from OU proxy.
+        $httpsexample = <<<EOF
+HTTP/1.0 200 Connection established
+
+HTTP/1.1 200 OK
+Date: Fri, 22 Feb 2013 17:14:23 GMT
+Server: Apache/2
+X-Powered-By: PHP/5.3.3-7+squeeze14
+Content-Type: text/xml
+Connection: close
+Content-Encoding: gzip
+Transfer-Encoding: chunked
+
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<rss version="2.0">...
+EOF;
+        $httpsexpected = <<<EOF
+HTTP/1.1 200 OK
+Date: Fri, 22 Feb 2013 17:14:23 GMT
+Server: Apache/2
+X-Powered-By: PHP/5.3.3-7+squeeze14
+Content-Type: text/xml
+Connection: close
+Content-Encoding: gzip
+Transfer-Encoding: chunked
+
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<rss version="2.0">...
+EOF;
+        // For HTTP, replace the \n with \r\n.
+        $httpsexample = preg_replace("~(?!<\r)\n~", "\r\n", $httpsexample);
+        $httpsexpected = preg_replace("~(?!<\r)\n~", "\r\n", $httpsexpected);
+
+        // Test stripping works OK.
+        $this->assertEquals($httpsexpected, curl::strip_double_headers($httpsexample));
+        // Test it does nothing to the 'plain' data.
+        $this->assertEquals($httpsexpected, curl::strip_double_headers($httpsexpected));
+    }
 }