import unittest, StringIO, robotparser
from test import test_support
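
# Each RobotTest() call below parses an inline robots.txt document and adds
# one RobotTestCase per URL to the module-level suite: "good" URLs must be
# fetchable according to can_fetch(), "bad" URLs must not.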

class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))
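
# Rough sketch of what each RobotTest() call exercises (illustration only,
# not part of the suite; the robots.txt text and URLs here are made up):
#
#   parser = robotparser.RobotFileParser()
#   parser.parse(StringIO.StringIO("User-agent: *\nDisallow: /tmp/\n").readlines())
#   parser.can_fetch("test_robotparser", "/tmp/x")     # expected: False
#   parser.can_fetch("test_robotparser", "/ok.html")   # expected: True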

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)
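
# Note (assumption about the implementation, not asserted by the test): rule
# paths appear to be matched as plain string prefixes of the request path, and
# "/." is not a prefix of "/foo.html", which is why that URL stays fetchable.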

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9.  This file is incorrect because "Googlebot" is a substring of
#     "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")
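
# Why tests 9 and 10 behave identically: robotparser appears to match a
# "User-agent:" value against the client agent string by lowercased substring,
# so the "Googlebot" record also applies to "Googlebot-Mobile" and, coming
# first, its "Disallow: /" wins for both agents.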

# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")

# 13. Google also got the order wrong in #8. You need to specify the
#     URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")

# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)
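
# Note (reading of issue #6325, not stated in this file): can_fetch() is
# expected to take the query string into account, so the rule above should
# block only the "?name=value" form and leave the bare path fetchable.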

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)
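
# Note: the point of issue #4108 is that only the first "User-agent: *" record
# should be obeyed, so the second Disallow above is expected to be ignored.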

class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)

    def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))


def test_main():
    test_support.run_unittest(tests)
    test_support.run_unittest(NetworkTestCase)

if __name__ == '__main__':
    test_support.verbose = 1
    test_main()