import unittest, StringIO, robotparser
from test import test_support
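
# Each RobotTest() call below parses an inline robots.txt document and adds
# one RobotTestCase per URL to the module-level suite: "good" URLs must be
# fetchable according to can_fetch(), "bad" URLs must not.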

class RobotTestCase(unittest.TestCase):
    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        if isinstance(self.url, tuple):
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str

tests = unittest.TestSuite()

def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))
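
# Rough sketch of what each RobotTest() call exercises (illustration only,
# not part of the suite; the robots.txt text and URLs here are made up):
#
#   parser = robotparser.RobotFileParser()
#   parser.parse(StringIO.StringIO("User-agent: *\nDisallow: /tmp/\n").readlines())
#   parser.can_fetch("test_robotparser", "/tmp/x")     # expected: False
#   parser.can_fetch("test_robotparser", "/ok.html")   # expected: True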

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/','/test.html']
bad = ['/cyberworld/map/index.html','/tmp/xxx','/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
"""

good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html','/','/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp','/tmp.html','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html','/a%2fb.html',
       '/~joe/index.html'
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/','/tmp/a.html',
       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
       '/%7Ejoe/index.html']

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)
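
# Note (assumption about the implementation, not asserted by the test): rule
# paths appear to be matched as plain string prefixes of the request path, and
# "/." is not a prefix of "/foo.html", which is why that URL stays fetchable.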

# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364

# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(8, doc, good, bad, agent="Googlebot")

# 9.  This file is incorrect because "Googlebot" is a substring of
#     "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /

User-agent: Googlebot-Mobile
Allow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(9, doc, good, bad, agent="Googlebot")

good = []
bad = ['/something.jpg']

RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")
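
# Why tests 9 and 10 behave identically: robotparser appears to match a
# "User-agent:" value against the client agent string by lowercased substring,
# so the "Googlebot" record also applies to "Googlebot-Mobile" and, coming
# first, its "Disallow: /" wins for both agents.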

# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /

User-agent: Googlebot
Disallow: /
"""

good = []
bad = ['/something.jpg']

RobotTest(11, doc, good, bad, agent="Googlebot")

good = ['/something.jpg']
bad = []

RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")

# 13. Google also got the order wrong in #8. You need to specify the
#     URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""

good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']

RobotTest(13, doc, good, bad, agent="googlebot")

# 14. For issue #6325 (query string support)
doc = """
User-agent: *
Disallow: /some/path?name=value
"""

good = ['/some/path']
bad = ['/some/path?name=value']

RobotTest(14, doc, good, bad)
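
# Note (reading of issue #6325, not stated in this file): can_fetch() is
# expected to take the query string into account, so the rule above should
# block only the "?name=value" form and leave the bare path fetchable.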

# 15. For issue #4108 (obey first * entry)
doc = """
User-agent: *
Disallow: /some/path

User-agent: *
Disallow: /another/path
"""

good = ['/another/path']
bad = ['/some/path']

RobotTest(15, doc, good, bad)
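
# Note: the point of issue #4108 is that only the first "User-agent: *" record
# should be obeyed, so the second Disallow above is expected to be ignored.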

class NetworkTestCase(unittest.TestCase):

    def testPasswordProtectedSite(self):
        test_support.requires('network')
        with test_support.transient_internet('mueblesmoraleda.com'):
            url = 'http://mueblesmoraleda.com'
            parser = robotparser.RobotFileParser()
            parser.set_url(url)
            try:
                parser.read()
            except IOError:
                self.skipTest('%s is unavailable' % url)
            self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)

    def testPythonOrg(self):
        test_support.requires('network')
        with test_support.transient_internet('www.python.org'):
            parser = robotparser.RobotFileParser(
                "http://www.python.org/robots.txt")
            parser.read()
            self.assertTrue(
                parser.can_fetch("*", "http://www.python.org/robots.txt"))


def test_main():
    test_support.run_unittest(tests)
    test_support.run_unittest(NetworkTestCase)

if __name__ == '__main__':
    test_support.verbose = 1
    test_main()