Source code in pyrobotstxt/__init__.py
class RobotsTxt:
    """Build, read and write the contents of a robots.txt file.

    An instance keeps a list of user agents plus an optional header, footer
    and ASCII image, and serialises them with :meth:`write`.
    """

    def __init__(self, version=""):
        """Initializes Robots.txt operations

        Args:
            version (str, optional): Version number (optional) for robots.txt. Defaults to "".
        """
        self.user_agents = []
        self.create_time = datetime.now()
        self.version = version
        self.image_branding = None
        self.header = ""  # message added to the start of the output file.
        self.footer = ""  # message added to the end of the output file.

    def read(self, robots_url):
        """Read a remote robots.txt file and register the user agents it lists.

        The URL is first passed through ``get_corrected_url`` (per the original
        documentation, a missing ``robots.txt`` suffix is added automatically)
        and then fetched with ``get_remote_content``. Parsing is only carried
        out when the response status code is below 400.

        Args:
            robots_url (str): robots.txt url at a remote location.
        """

        def directive_values(lines, directive):
            # Slice the prefix off instead of splitting on it, so a value that
            # happens to contain the directive text again is kept whole, and
            # strip the blank that normally follows the colon.
            return [
                line[len(directive):].strip()
                for line in lines
                if line.startswith(directive)
            ]

        self.create_time = datetime.now()
        robots_url = get_corrected_url(robots_url, "")
        response = get_remote_content(robots_url)
        if response.status_code >= 400:
            return

        for ua_item in response.text.split("User-agent:"):
            # Strip before filtering so whitespace-only lines (blank lines,
            # stray "\r") are dropped and cannot end up as the agent name.
            stripped = (raw.strip() for raw in ua_item.split("\n"))
            ua_content_items = [line for line in stripped if line]

            # Nothing usable in this segment (e.g. the file starts with a
            # blank line), or the segment is a leading comment block.
            # NOTE(review): other text that precedes the first "User-agent:"
            # is still treated as an agent name, as before.
            if not ua_content_items or ua_content_items[0].startswith("#"):
                continue

            ua = UserAgent(ua_name=ua_content_items[0])
            ua.add_allow(directive_values(ua_content_items[1:], "Allow:"))
            ua.add_disallow(directive_values(ua_content_items[1:], "Disallow:"))
            # TODO: Comments are not included Yet
            self.add_user_agent(ua=ua)

    def write(self, file_path="robots.txt"):
        """write robots.txt file at a given file_path location.

        Output order: header, every user agent's consolidated content (each
        followed by a newline), the ASCII image, then the footer. Empty parts
        are skipped.

        Args:
            file_path (str, optional): location of robots.txt file. Defaults to "robots.txt".
        """
        with open(file_path, "w") as f:
            # include header
            # NOTE(review): no newline is written after the header, so the
            # first user agent's content follows it directly unless that
            # content starts with one -- confirm against UserAgent.consolidate.
            if self.header:
                f.write(f"# {self.header}")

            # include user agents with consolidate text
            for ua in self.user_agents:
                ua.consolidate()
                f.write(ua.content)
                f.write("\n")

            # append ascii image, if available
            if self.image_branding:
                f.write(self.image_branding)

            # append footer message
            if self.footer:
                f.write(f"\n# {self.footer}")

    def include_header(self, message="", append_date=True):
        """include header message with/without creation date.

        Args:
            message (str, optional): header message. Defaults to "".
            append_date (bool, optional): Append date/time to the header. Defaults to True.
        """
        self.header = message
        if append_date:
            self.header += f"\n# Created on {self.create_time} using pyrobotstxt"

    def include_footer(self, message=""):
        """include footer message

        Args:
            message (str, optional): footer message. Defaults to "".
        """
        self.footer = message

    def include_image(self, image_path=None, desired_width=90):
        """includes ascii image generated from the file at image_path

        Args:
            image_path (str): location of image file. Defaults to None.
            desired_width (int, optional): desired width of ASCII image. Defaults to 90(chars).
        """
        img = ImageAsASCII(image_path=image_path, desired_width=desired_width)
        img.map_to_ascii()
        self.image_branding = img.ascii_image

    def add_user_agent(self, ua):
        """Add/Append user agent to RobotsTxt

        Args:
            ua (UserAgent): user agent to be included in final robots.txt file.
        """
        self.user_agents.append(ua)

    def remove_user_agent(self, ua_name=""):
        """Remove every user agent whose name equals ua_name.

        Names that match nothing leave the list unchanged.

        Args:
            ua_name (str, optional): name of the user agent to drop. Defaults to "".
        """
        # Lists do not support "-=", so rebuild the list without the matches.
        # NOTE(review): assumes UserAgent exposes a ``name`` attribute, as the
        # original comparison did -- confirm against the UserAgent class.
        self.user_agents = [ua for ua in self.user_agents if ua.name != ua_name]

    @staticmethod
    def robots_name(crawl_bot):
        """Find robot name, if you know any keyword about that crawl bot.

        An entry of ROBOTS matches when the capitalized keyword is contained
        in that entry's value.

        Args:
            crawl_bot (str): description about the crawl bot. e.g. facebook

        Returns:
            (dict): all matching crawl bots with relevant information
        """
        return {
            robot: ROBOTS[robot]
            for robot in ROBOTS
            if crawl_bot.capitalize() in ROBOTS[robot]
        }

    @staticmethod
    def robots_details(crawl_bot):
        """Static Method to return details about any crawl bot.

        The name is compared case-insensitively against the keys of ROBOTS.

        Args:
            crawl_bot (str): name of crawl bot

        Returns:
            (dict): information about all crawl bots matching to input string.
        """
        return {
            robot: ROBOTS[robot]
            for robot in ROBOTS
            if crawl_bot.lower() == robot.lower()
        }
__init__(version='')
Initializes Robots.txt operations
Parameters: |
-
version
(str , default:
''
)
–
Version number (optional) for robots.txt. Defaults to “”.
|
pyrobotstxt/__init__.py
def __init__(self, version=""):
    """Set up an empty robots.txt description.

    Args:
        version (str, optional): Version number (optional) for robots.txt. Defaults to "".
    """
    self.version = version
    self.create_time = datetime.now()
    # Nothing registered yet: no agents and no ASCII branding image.
    self.user_agents = []
    self.image_branding = None
    # header goes to the start of the output file, footer to the end.
    self.header = ""
    self.footer = ""
add_user_agent(ua)
Add/Append user agent to RobotsTxt
Parameters: |
-
ua
(UserAgent )
–
user agent to be included in final robots.txt file.
|
pyrobotstxt/__init__.py
def add_user_agent(self, ua):
    """Register one more user agent at the end of the current list.

    Args:
        ua (UserAgent): user agent to be included in final robots.txt file.
    """
    registered = self.user_agents
    registered.append(ua)
include_footer(message='')
include footer message
Parameters: |
-
message
(str , default:
''
)
–
footer message. Defaults to “”.
|
pyrobotstxt/__init__.py
def include_footer(self, message=""):
    """Store the message that is written at the end of the output file.

    Args:
        message (str, optional): footer message. Defaults to "".
    """
    # Replaces any footer set earlier.
    self.footer = message
include_header(message='', append_date=True)
include header message with/without creation date.
Parameters: |
-
message
(str , default:
''
)
–
header or header message. Defaults to “”.
-
append_date
(bool , default:
True
)
–
Append date/time to the header. Defaults to True.
|
pyrobotstxt/__init__.py
def include_header(self, message="", append_date=True):
    """Store the header message, optionally followed by a creation-date line.

    Args:
        message (str, optional): header message. Defaults to "".
        append_date (bool, optional): Append date/time to the header. Defaults to True.
    """
    self.header = message
    if not append_date:
        return
    # The date line carries its own "# " prefix because it starts a new line.
    date_line = f"\n# Created on {self.create_time} using pyrobotstxt"
    self.header = self.header + date_line
include_image(image_path=None, desired_width=90)
includes ascii image generated from the file at image_path
Parameters: |
-
image_path
(str , default:
None
)
–
location of image file. Defaults to None.
-
desired_width
(int , default:
90
)
–
desired width of ASCII image. Defaults to 90(chars).
|
pyrobotstxt/__init__.py
def include_image(self, image_path=None, desired_width=90):
    """Convert an image to ASCII and keep the result as branding.

    Args:
        image_path (str): location of image file. Defaults to None.
        desired_width (int, optional): desired width of ASCII image. Defaults to 90(chars).
    """
    converter = ImageAsASCII(image_path=image_path, desired_width=desired_width)
    # map_to_ascii() has to run before ascii_image is read.
    converter.map_to_ascii()
    self.image_branding = converter.ascii_image
read(robots_url)
Read a Remote Robots.txt file from a given URL
If robots_url is missing the robots.txt file name, it will be added automatically.
Parsing will only be carried out if robots_url returns a valid response object.
Parameters: |
-
robots_url
(str )
–
robots.txt url at a remote location.
|
pyrobotstxt/__init__.py
def read(self, robots_url):
    """Read a remote robots.txt file and register the user agents it lists.

    The URL is first passed through ``get_corrected_url`` (per the original
    documentation, a missing ``robots.txt`` suffix is added automatically)
    and then fetched with ``get_remote_content``. Parsing is only carried
    out when the response status code is below 400.

    Args:
        robots_url (str): robots.txt url at a remote location.
    """

    def directive_values(lines, directive):
        # Slice the prefix off instead of splitting on it, so a value that
        # happens to contain the directive text again is kept whole, and
        # strip the blank that normally follows the colon.
        return [
            line[len(directive):].strip()
            for line in lines
            if line.startswith(directive)
        ]

    self.create_time = datetime.now()
    robots_url = get_corrected_url(robots_url, "")
    response = get_remote_content(robots_url)
    if response.status_code >= 400:
        return

    for ua_item in response.text.split("User-agent:"):
        # Strip before filtering so whitespace-only lines (blank lines,
        # stray "\r") are dropped and cannot end up as the agent name.
        stripped = (raw.strip() for raw in ua_item.split("\n"))
        ua_content_items = [line for line in stripped if line]

        # Nothing usable in this segment (e.g. the file starts with a blank
        # line), or the segment is a leading comment block.
        # NOTE(review): other text that precedes the first "User-agent:" is
        # still treated as an agent name, as before.
        if not ua_content_items or ua_content_items[0].startswith("#"):
            continue

        ua = UserAgent(ua_name=ua_content_items[0])
        ua.add_allow(directive_values(ua_content_items[1:], "Allow:"))
        ua.add_disallow(directive_values(ua_content_items[1:], "Disallow:"))
        # TODO: Comments are not included Yet
        self.add_user_agent(ua=ua)
remove_user_agent(ua_name='')
Remove user agent from RobotsTxt
Parameters: |
-
ua_name
(str , default:
''
)
–
name of the user agent to remove from those already included in the robots.txt file.
|
pyrobotstxt/__init__.py
def remove_user_agent(self, ua_name=""):
    """Remove every user agent whose name equals ua_name.

    Names that match nothing leave the list unchanged.

    Args:
        ua_name (str, optional): name of the user agent to drop. Defaults to "".
    """
    # Lists do not support "-=", so rebuild the list without the matches.
    # NOTE(review): assumes UserAgent exposes a ``name`` attribute, as the
    # original comparison did -- confirm against the UserAgent class.
    self.user_agents = [ua for ua in self.user_agents if ua.name != ua_name]
robots_details(crawl_bot)
staticmethod
Static Method to return details about any crawl bot.
Returns: |
-
dict
–
information about all crawl bots matching to input string.
|
pyrobotstxt/__init__.py
@staticmethod
def robots_details(crawl_bot):
    """Look up crawl bots by name, ignoring letter case.

    Args:
        crawl_bot (str): name of crawl bot

    Returns:
        (dict): information about all crawl bots matching to input string.
    """
    matches = {}
    for robot in ROBOTS:
        # Both sides are lower-cased, so the comparison is case-insensitive.
        if crawl_bot.lower() == robot.lower():
            matches[robot] = ROBOTS[robot]
    return matches
robots_name(crawl_bot)
staticmethod
Find robot name, if you know any keyword about that crawl bot.
Parameters: |
-
crawl_bot
(str )
–
description about the crawl bot. e.g. facebook
|
Returns: |
-
dict
–
all matching crawl bots with relevant information
|
pyrobotstxt/__init__.py
@staticmethod
def robots_name(crawl_bot):
    """Find crawl bots from a keyword describing them.

    An entry of ROBOTS matches when the capitalized keyword is contained in
    that entry's value.

    Args:
        crawl_bot (str): description about the crawl bot. e.g. facebook

    Returns:
        (dict): all matching crawl bots with relevant information
    """
    matches = {}
    for robot in ROBOTS:
        if crawl_bot.capitalize() in ROBOTS[robot]:
            matches[robot] = ROBOTS[robot]
    return matches
write(file_path='robots.txt')
write robots.txt file at a given file_path location.
Parameters: |
-
file_path
(str , default:
'robots.txt'
)
–
location of robots.txt file. Defaults to “robots.txt”.
|
pyrobotstxt/__init__.py
def write(self, file_path="robots.txt"):
    """Serialise the collected robots.txt content to a file.

    Output order: header, every user agent's consolidated content (each
    followed by a newline), the ASCII image, then the footer. Empty parts
    are skipped.

    Args:
        file_path (str, optional): location of robots.txt file. Defaults to "robots.txt".
    """

    def pieces():
        # Yielded lazily so each agent is consolidated right before its
        # content is written, exactly as in a plain write loop.
        if self.header:
            yield f"# {self.header}"
        for agent in self.user_agents:
            agent.consolidate()
            yield agent.content
            yield "\n"
        if self.image_branding:
            yield self.image_branding
        if self.footer:
            yield f"\n# {self.footer}"

    with open(file_path, "w") as out:
        out.writelines(pieces())