|  | """ robotparser.py | 
|  |  | 
|  | Copyright (C) 2000  Bastian Kleineidam | 
|  |  | 
|  | You can choose between two licenses when using this package: | 
|  | 1) GNU GPLv2 | 
|  | 2) PSF license for Python 2.2 | 
|  |  | 
|  | The robots.txt Exclusion Protocol is implemented as specified in | 
|  | http://www.robotstxt.org/norobots-rfc.txt | 
|  |  | 
|  | """ | 
|  | import urlparse | 
|  | import urllib | 
|  |  | 
|  | __all__ = ["RobotFileParser"] | 
|  |  | 
|  |  | 
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

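    # Usage sketch (the URL below is a placeholder, not taken from this
    # module): a parser is normally pointed at a site's /robots.txt and
    # read() is called before any can_fetch() queries, e.g.
    #
    #   rp = RobotFileParser()
    #   rp.set_url("http://www.example.com/robots.txt")
    #   rp.read()
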
    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        f.close()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # robots.txt is access-protected: treat the whole site as disallowed
            self.disallow_all = True
        elif self.errcode >= 400 and self.errcode < 500:
            # any other client error (e.g. 404) means no usable robots.txt,
            # so everything is allowed
            self.allow_all = True
        elif self.errcode == 200 and lines:
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """parse the input lines from a robots.txt file.
           We allow that a user-agent: line is not preceded by
           one or more blank lines."""
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        linenumber = 0
        entry = Entry()

        self.modified()
        for line in lines:
            linenumber += 1
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)

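    # Illustration (example rules invented for this comment): given
    #
    #   User-agent: *
    #   Disallow: /private/
    #
    # parse() goes 0 -> 1 on the "User-agent" line and 1 -> 2 on the
    # "Disallow" line; when input ends in state 2, the finished Entry is
    # stored via _add_entry(), here as the default ("*") entry.
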
    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True

        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False

        # search for given user agent matches
        # the first match counts
        parsed_url = urlparse.urlparse(urllib.unquote(url))
        url = urlparse.urlunparse(('', '', parsed_url.path,
            parsed_url.params, parsed_url.query, parsed_url.fragment))
        url = urllib.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

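    # Typical query (hypothetical agent name and URL): after read() or
    # parse(), a crawler asks
    #
    #   rp.can_fetch("MyCrawler/1.0", "http://www.example.com/private/page.html")
    #
    # and the answer comes from the first entry whose user-agent matches,
    # falling back to the "*" entry, and finally to True.
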
    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n'.join(map(str, entries)) + '\n'


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
       (allowance==False) followed by a path."""
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        path = urlparse.urlunparse(urlparse.urlparse(path))
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path

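# Matching note (illustrative paths, not from the original source): rule
# matching is a plain prefix test, so RuleLine("/private", False) applies to
# "/private", "/private/", and "/private/data.html" alike.
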
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True

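# Matching note (the agent names here are made up): Entry.applies_to does a
# case-insensitive substring test on the product token, so an entry for
# "figtree" matches the user agent string "FigTree/0.1 libwww-perl/5.04".
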
class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def prompt_user_passwd(self, host, realm):
        ## If robots.txt file is accessible only with a password,
        ## we act as if the file wasn't there.
        return None, None

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                         errmsg, headers)
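

if __name__ == '__main__':
    # Minimal self-contained sketch (not part of the original module): feed
    # parse() a hard-coded robots.txt instead of fetching one over HTTP,
    # then query can_fetch(). The agent name and paths are made up.
    _demo = RobotFileParser()
    _demo.parse([
        "User-agent: *",
        "Allow: /cgi-bin/public",
        "Disallow: /cgi-bin/",
    ])
    print _demo.can_fetch("SomeBot/1.0", "/cgi-bin/public")    # True
    print _demo.can_fetch("SomeBot/1.0", "/cgi-bin/private")   # False
    print _demo.can_fetch("SomeBot/1.0", "/index.html")        # True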