UserAgent Class
Source code in pyrobotstxt/__init__.py
```python
class UserAgent:
    def __init__(self, ua_name="*", crawl_delay=0):
        """Initialize a UserAgent object with a user-agent name and crawl delay.

        Args:
            ua_name (str, optional): name of the user agent. Defaults to "*".
            crawl_delay (int, optional): crawl delay value for the user agent/bot. Defaults to 0.
        """
        self.user_agent_name = ua_name
        self.crawl_delay = crawl_delay
        self.sitemaps = []  # list of sitemaps for the current UserAgent
        self.allowed = []  # list of allowed items for the current UserAgent
        self.disallowed = []  # list of disallowed items for the current UserAgent
        self.content = ""  # consolidated content for the robots.txt file

    def add_allow(self, allow_items, unique=True, comments=""):
        """Add allowed items/pages/slugs to the current user agent.

        Args:
            allow_items (str, list): single item or list of items allowed for the current user agent.
            unique (bool, optional): if True, duplicate items are collapsed to a single value. Defaults to True.
            comments (str, optional): comment to accompany the added value, for human readability. Defaults to "".
        """
        if isinstance(allow_items, str):
            allow_items = [allow_items]
        if not isinstance(allow_items, list):
            raise TypeError(f"unsupported type: {type(allow_items)}")
        self.allowed += allow_items
        if unique:
            self.allowed = list(set(self.allowed))

    def remove_allow(self, allow_item):
        """Remove a previously added item from the allowed list.

        Args:
            allow_item (str): item to be removed.
        """
        if allow_item in self.allowed:
            self.allowed.remove(allow_item)

    def add_disallow(self, disallow_items, unique=True, comments=""):
        """Add disallowed items/pages/slugs to the current user agent.

        Args:
            disallow_items (str, list): single item or list of items disallowed for the current user agent.
            unique (bool, optional): if True, duplicate items are collapsed to a single value. Defaults to True.
            comments (str, optional): comment to accompany the added value, for human readability. Defaults to "".
        """
        if isinstance(disallow_items, str):
            disallow_items = [disallow_items]
        if not isinstance(disallow_items, list):
            raise TypeError(f"unsupported type: {type(disallow_items)}")
        self.disallowed += disallow_items
        if unique:
            self.disallowed = list(set(self.disallowed))

    def remove_disallow(self, disallow_item):
        """Remove a previously added item from the disallowed list.

        Args:
            disallow_item (str): item to be removed.
        """
        if disallow_item in self.disallowed:
            self.disallowed.remove(disallow_item)

    def add_sitemap(self, site_map_path=None, comments=""):
        """Add the file path of a sitemap to the current user agent.

        Args:
            site_map_path (str): location of the sitemap. Defaults to None.
            comments (str): comment to include with the sitemap path. Defaults to "".
        """
        if not site_map_path:
            raise ValueError("site_map_path is required")
        self.sitemaps.append(site_map_path)

    def remove_sitemap(self, site_map_path=None):
        """Remove a sitemap from the current user agent.

        Args:
            site_map_path (str): sitemap file path to be removed. Defaults to None.
        """
        if site_map_path in self.sitemaps:
            self.sitemaps.remove(site_map_path)

    def disallow_pagination(self, prefix="/page/*", comments=""):
        """Disallow crawling of pagination pages via the robots.txt file.

        Args:
            prefix (str, optional): prefix for pagination pages. Defaults to "/page/*".
            comments (str, optional): human-readable comment for inclusion. Defaults to "".
        """
        self.add_disallow(disallow_items=prefix, comments=comments)

    def consolidate(self):
        """Consolidate all the information (allowed, disallowed, sitemaps) into a single text string."""
        self.content = f"User-agent: {self.user_agent_name}"
        # Support for including Crawl-delay. See feature request #1.
        if self.crawl_delay > 0:
            self.content += f"\nCrawl-delay: {self.crawl_delay}\n"
        if self.allowed:
            self.content += "\n# Allowed Patterns\n"
            self.content += "\n".join([f"Allow: {item}" for item in self.allowed])
        if self.disallowed:
            self.content += "\n\n# Disallowed Patterns\n"
            self.content += "\n".join([f"Disallow: {item}" for item in self.disallowed])
        if self.sitemaps:
            self.content += "\n\n# Site Maps\n"
            self.content += "\n".join([f"Sitemap: {item}" for item in self.sitemaps])
        self.content += "\n\n"
```
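Example (a minimal end-to-end sketch; the import path follows from the class living in pyrobotstxt/__init__.py, and the bot name and URLs are illustrative):

```python
from pyrobotstxt import UserAgent

# Build a rule group for a specific crawler with a 10-second crawl delay.
ua = UserAgent(ua_name="Googlebot", crawl_delay=10)
ua.add_allow("/blog/")
ua.add_disallow(["/admin/", "/tmp/"])
ua.add_sitemap("https://example.com/sitemap.xml")

# Render the group into robots.txt text and print it.
ua.consolidate()
print(ua.content)
```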
__init__(ua_name='*', crawl_delay=0)

Initialize a UserAgent object with a user-agent name and crawl delay.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `ua_name` | `str` | name of the user agent | `'*'` |
| `crawl_delay` | `int` | crawl delay value for the user agent/bot | `0` |
pyrobotstxt/__init__.py
```python
def __init__(self, ua_name="*", crawl_delay=0):
    """Initialize a UserAgent object with a user-agent name and crawl delay.

    Args:
        ua_name (str, optional): name of the user agent. Defaults to "*".
        crawl_delay (int, optional): crawl delay value for the user agent/bot. Defaults to 0.
    """
    self.user_agent_name = ua_name
    self.crawl_delay = crawl_delay
    self.sitemaps = []  # list of sitemaps for the current UserAgent
    self.allowed = []  # list of allowed items for the current UserAgent
    self.disallowed = []  # list of disallowed items for the current UserAgent
    self.content = ""  # consolidated content for the robots.txt file
```
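Example (illustrative; the default `"*"` name targets all crawlers):

```python
from pyrobotstxt import UserAgent

all_bots = UserAgent()                                 # User-agent: *
bingbot = UserAgent(ua_name="Bingbot", crawl_delay=5)  # emits Crawl-delay: 5 on consolidate()
```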
add_allow(allow_items, unique=True, comments='')

Add allowed items/pages/slugs to the current user agent.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `allow_items` | `str, list` | single item or list of items allowed for the current user agent | *required* |
| `unique` | `bool` | if True, duplicate items are collapsed to a single value | `True` |
| `comments` | `str` | comment to accompany the added value, for human readability | `''` |
pyrobotstxt/__init__.py
```python
def add_allow(self, allow_items, unique=True, comments=""):
    """Add allowed items/pages/slugs to the current user agent.

    Args:
        allow_items (str, list): single item or list of items allowed for the current user agent.
        unique (bool, optional): if True, duplicate items are collapsed to a single value. Defaults to True.
        comments (str, optional): comment to accompany the added value, for human readability. Defaults to "".
    """
    if isinstance(allow_items, str):
        allow_items = [allow_items]
    if not isinstance(allow_items, list):
        raise TypeError(f"unsupported type: {type(allow_items)}")
    self.allowed += allow_items
    if unique:
        self.allowed = list(set(self.allowed))
```
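Example (a sketch of both accepted input shapes; note that `comments` is accepted but not emitted by the source shown above):

```python
from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_allow("/blog/")               # a single string is wrapped into a list
ua.add_allow(["/blog/", "/about/"])  # list form; the duplicate collapses with unique=True
print(sorted(ua.allowed))            # ['/about/', '/blog/']
```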
add_disallow(disallow_items, unique=True, comments='')

Add disallowed items/pages/slugs to the current user agent.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `disallow_items` | `str, list` | single item or list of items disallowed for the current user agent | *required* |
| `unique` | `bool` | if True, duplicate items are collapsed to a single value | `True` |
| `comments` | `str` | comment to accompany the added value, for human readability | `''` |
pyrobotstxt/__init__.py
```python
def add_disallow(self, disallow_items, unique=True, comments=""):
    """Add disallowed items/pages/slugs to the current user agent.

    Args:
        disallow_items (str, list): single item or list of items disallowed for the current user agent.
        unique (bool, optional): if True, duplicate items are collapsed to a single value. Defaults to True.
        comments (str, optional): comment to accompany the added value, for human readability. Defaults to "".
    """
    if isinstance(disallow_items, str):
        disallow_items = [disallow_items]
    if not isinstance(disallow_items, list):
        raise TypeError(f"unsupported type: {type(disallow_items)}")
    self.disallowed += disallow_items
    if unique:
        self.disallowed = list(set(self.disallowed))
```
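Example (the disallow side mirrors add_allow; slugs are illustrative):

```python
from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_disallow("/private/")                 # single slug
ua.add_disallow(["/cgi-bin/", "/private/"])  # list form; the duplicate is dropped
print(sorted(ua.disallowed))                 # ['/cgi-bin/', '/private/']
```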
add_sitemap(site_map_path=None, comments='')

Add the file path of a sitemap to the current user agent.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `site_map_path` | `str` | location of the sitemap | `None` |
| `comments` | `str` | comment to include with the sitemap path | `''` |
pyrobotstxt/__init__.py
```python
def add_sitemap(self, site_map_path=None, comments=""):
    """Add the file path of a sitemap to the current user agent.

    Args:
        site_map_path (str): location of the sitemap. Defaults to None.
        comments (str): comment to include with the sitemap path. Defaults to "".
    """
    if not site_map_path:
        raise ValueError("site_map_path is required")
    self.sitemaps.append(site_map_path)
```
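Example (the sitemap URL is illustrative; omitting the path raises ValueError):

```python
from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_sitemap("https://example.com/sitemap.xml")
print(ua.sitemaps)  # ['https://example.com/sitemap.xml']

# ua.add_sitemap()  # would raise ValueError: site_map_path is required
```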
consolidate()

Consolidate all the information (allowed, disallowed, sitemaps) into a single text string.
pyrobotstxt/__init__.py
```python
def consolidate(self):
    """Consolidate all the information (allowed, disallowed, sitemaps) into a single text string."""
    self.content = f"User-agent: {self.user_agent_name}"
    # Support for including Crawl-delay. See feature request #1.
    if self.crawl_delay > 0:
        self.content += f"\nCrawl-delay: {self.crawl_delay}\n"
    if self.allowed:
        self.content += "\n# Allowed Patterns\n"
        self.content += "\n".join([f"Allow: {item}" for item in self.allowed])
    if self.disallowed:
        self.content += "\n\n# Disallowed Patterns\n"
        self.content += "\n".join([f"Disallow: {item}" for item in self.disallowed])
    if self.sitemaps:
        self.content += "\n\n# Site Maps\n"
        self.content += "\n".join([f"Sitemap: {item}" for item in self.sitemaps])
    self.content += "\n\n"
```
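Example (a sketch of the rendered output; blank lines shown approximately, and the bot name and URL are illustrative):

```python
from pyrobotstxt import UserAgent

ua = UserAgent(ua_name="Googlebot", crawl_delay=10)
ua.add_disallow("/admin/")
ua.add_sitemap("https://example.com/sitemap.xml")
ua.consolidate()
print(ua.content)
# User-agent: Googlebot
# Crawl-delay: 10
#
# # Disallowed Patterns
# Disallow: /admin/
#
# # Site Maps
# Sitemap: https://example.com/sitemap.xml
```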
disallow_pagination(prefix='/page/*', comments='')

Disallow crawling of pagination pages via the robots.txt file.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `prefix` | `str` | prefix for pagination pages | `'/page/*'` |
| `comments` | `str` | human-readable comment for inclusion | `''` |
pyrobotstxt/__init__.py
```python
def disallow_pagination(self, prefix="/page/*", comments=""):
    """Disallow crawling of pagination pages via the robots.txt file.

    Args:
        prefix (str, optional): prefix for pagination pages. Defaults to "/page/*".
        comments (str, optional): human-readable comment for inclusion. Defaults to "".
    """
    self.add_disallow(disallow_items=prefix, comments=comments)
```
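Example (the convenience wrapper with the default and an illustrative custom prefix):

```python
from pyrobotstxt import UserAgent

ua = UserAgent()
ua.disallow_pagination()                       # adds "/page/*" to the disallowed list
ua.disallow_pagination(prefix="/blog/page/*")  # custom pagination prefix
print(sorted(ua.disallowed))                   # ['/blog/page/*', '/page/*']
```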
remove_allow(allow_item)

Remove a previously added item from the allowed list.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `allow_item` | `str` | item to be removed | *required* |
pyrobotstxt/__init__.py
```python
def remove_allow(self, allow_item):
    """Remove a previously added item from the allowed list.

    Args:
        allow_item (str): item to be removed.
    """
    if allow_item in self.allowed:
        self.allowed.remove(allow_item)
```
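Example (removal is a no-op for items that were never added):

```python
from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_allow(["/blog/", "/about/"])
ua.remove_allow("/about/")
ua.remove_allow("/missing/")  # absent item: silently ignored
print(ua.allowed)             # ['/blog/']
```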
remove_disallow(disallow_item)

Remove a previously added item from the disallowed list.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `disallow_item` | `str` | item to be removed | *required* |
pyrobotstxt/__init__.py
```python
def remove_disallow(self, disallow_item):
    """Remove a previously added item from the disallowed list.

    Args:
        disallow_item (str): item to be removed.
    """
    if disallow_item in self.disallowed:
        self.disallowed.remove(disallow_item)
```
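Example (the same pattern on the disallowed list):

```python
from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_disallow(["/admin/", "/tmp/"])
ua.remove_disallow("/tmp/")
print(ua.disallowed)  # ['/admin/']
```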
remove_sitemap(site_map_path=None)

Remove a sitemap from the current user agent.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `site_map_path` | `str` | sitemap file path to be removed | `None` |
pyrobotstxt/__init__.py
```python
def remove_sitemap(self, site_map_path=None):
    """Remove a sitemap from the current user agent.

    Args:
        site_map_path (str): sitemap file path to be removed. Defaults to None.
    """
    if site_map_path in self.sitemaps:
        self.sitemaps.remove(site_map_path)
```
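Example (likewise for sitemaps; the URL is illustrative):

```python
from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_sitemap("https://example.com/sitemap.xml")
ua.remove_sitemap("https://example.com/sitemap.xml")
print(ua.sitemaps)  # []
```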