UserAgent Class

Source code in pyrobotstxt/__init__.py
class UserAgent:
    def __init__(self, ua_name="*", crawl_delay=0):
        """Initialize UserAgent objet with a user-agent name and crawl delay varible.

        Args:
            ua_name (str, optional): name of the user-agent. Defaults to "*".
            crawl_delay (int, optional): crawl delay value for user agent/bots. Defaults to 0.
        """
        self.user_agent_name = ua_name
        self.crawl_delay = crawl_delay
        self.sitemaps = []  # lists of sitemap for current UserAgent
        self.allowed = []  # lists of Allowed Items for current UserAgent
        self.disallowed = []  # lists of Disallowed Items for current UserAgent
        self.content = ""  # consolidate content for robots.txt file

    def add_allow(self, allow_items, unique=True, comments=""):
        """Add allowed items/pages/slugs to current User Agent.

        Args:
            allow_items (str, list): single item or list of items allowed for current user agnet.
            unique (bool, optional): If True duplicate item stripped to single value. Defaults to True.
            comments (str, optional): Any comments for added value for human readability. Defaults to "".
        """

        if isinstance(allow_items, str):
            allow_items = [allow_items]

        if not isinstance(allow_items, list):
            raise TypeError(f"allow_items must be a str or list, got {type(allow_items)}")

        self.allowed += allow_items
        if unique:
            self.allowed = list(set(self.allowed))

    def remove_allow(self, allow_item):
        """Remove any previously added allowed item from allowed list.

        Args:
            allow_item (str): item to be removed.
        """

        if allow_item in self.allowed:
            self.allowed.remove(allow_item)

    def add_disallow(self, disallow_items, unique=True, comments=""):
        """Add disallowed items/pages/slugs to current User Agent.

        Args:
            disallow_items (str, list): single item or list of items disallowed for current user agnet.
            unique (bool, optional): If True duplicate item stripped to single value. Defaults to True.
            comments (str, optional): Any comments for added value for human readability. Defaults to "".
        """
        if isinstance(disallow_items, str):
            disallow_items = [disallow_items]

        if not isinstance(disallow_items, list):
            raise TypeError(f"disallow_items must be a str or list, got {type(disallow_items)}")

        self.disallowed += disallow_items
        if unique:
            self.disallowed = list(set(self.disallowed))

    def remove_disallow(self, disallow_item):
        """Remove any previously added disallowed item from allowed list.

        Args:
            disallow_item (str): item to be removed.
        """

        if disallow_item in self.disallowed:
            self.disallowed.remove(disallow_item)

    def add_sitemap(self, site_map_path=None, comments=""):
        """add file path of sitemap to current user agent.

        Args:
            site_map_path (str): location of sitemap. Defaults to None.
            comments (str): any comments to include with sitemap path. Defaults to "".
        """
        if not site_map_path:
            raise ValueError("site_map_path must be provided")

        self.sitemaps.append(site_map_path)

    def remove_sitemap(self, site_map_path=None):
        """remove a sitemap from current user agent.

        Args:
            site_map_path (str): sitemap file path to be removed. Defaults to None.
        """

        if site_map_path in self.sitemaps:
            self.sitemaps.remove(site_map_path)

    def disallow_pagination(self, prefix="/page/*", comments=""):
        """Single function to disable pagination on a website using robots.txt file.

        Args:
            prefix (str, optional): URL prefix for pagination pages. Defaults to "/page/*".
            comments (str, optional): human readable comments for inclusion. Defaults to "".
        """
        self.add_disallow(disallow_items=prefix, comments=comments)

    def consolidate(self):
        """consolidate all the information (allowed, disallowed, sitemaps) in single text string."""

        self.content = f"User-agent: {self.user_agent_name}"

        # Support for including Crawl-delay; see feature request #1.
        if self.crawl_delay > 0:
            self.content += f"\nCrawl-delay: {self.crawl_delay}\n"

        if self.allowed:
            self.content += "\n# Allowed Patterns\n"
            self.content += "\n".join([f"Allow: {item}" for item in self.allowed])

        if self.disallowed:
            self.content += "\n\n# Disallowed Patterns\n"
            self.content += "\n".join([f"Disallow: {item}" for item in self.disallowed])

        if self.sitemaps:
            self.content += "\n\n# Site Maps\n"
            self.content += "\n".join([f"Sitemap: {item}" for item in self.sitemaps])

        self.content += "\n\n"

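A minimal end-to-end sketch of the class (the import follows from the source path above; the paths and sitemap URL are illustrative):

from pyrobotstxt import UserAgent

ua = UserAgent(ua_name="*")
ua.add_allow("/blog/")                             # illustrative paths
ua.add_disallow(["/admin/", "/tmp/"])
ua.add_sitemap("https://example.com/sitemap.xml")  # illustrative URL
ua.consolidate()
print(ua.content)  # robots.txt rules for this user agent
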
__init__(ua_name='*', crawl_delay=0)

Initialize UserAgent object with a user-agent name and a crawl delay value.

Parameters:
  • ua_name (str, default: '*' ) –

    name of the user-agent. Defaults to “*”.

  • crawl_delay (int, default: 0 ) –

    crawl delay value for user agent/bots. Defaults to 0.

pyrobotstxt/__init__.py
def __init__(self, ua_name="*", crawl_delay=0):
    """Initialize UserAgent objet with a user-agent name and crawl delay varible.

    Args:
        ua_name (str, optional): name of the user-agent. Defaults to "*".
        crawl_delay (int, optional): crawl delay value for user agent/bots. Defaults to 0.
    """
    self.user_agent_name = ua_name
    self.crawl_delay = crawl_delay
    self.sitemaps = []  # lists of sitemap for current UserAgent
    self.allowed = []  # lists of Allowed Items for current UserAgent
    self.disallowed = []  # lists of Disallowed Items for current UserAgent
    self.content = ""  # consolidate content for robots.txt file

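For example, a crawler-specific agent with a crawl delay might be created like this (the bot name is hypothetical):

from pyrobotstxt import UserAgent

slow_bot = UserAgent(ua_name="ExampleBot", crawl_delay=10)  # "ExampleBot" is illustrative
# a crawl_delay greater than 0 adds "Crawl-delay: 10" when consolidate() is called
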
add_allow(allow_items, unique=True, comments='')

Add allowed items/pages/slugs to the current User Agent.

Parameters:
  • allow_items (str or list) –

    single item or list of items allowed for the current user agent.

  • unique (bool, default: True ) –

    If True, duplicate items are collapsed to a single value. Defaults to True.

  • comments (str, default: '' ) –

    Optional comment included for human readability. Defaults to “”.

pyrobotstxt/__init__.py
def add_allow(self, allow_items, unique=True, comments=""):
    """Add allowed items/pages/slugs to current User Agent.

    Args:
        allow_items (str, list): single item or list of items allowed for current user agnet.
        unique (bool, optional): If True duplicate item stripped to single value. Defaults to True.
        comments (str, optional): Any comments for added value for human readability. Defaults to "".
    """

    if isinstance(allow_items, str):
        allow_items = [allow_items]

    if not isinstance(allow_items, list):
        raise TypeError(f"allow_items must be a str or list, got {type(allow_items)}")

    self.allowed += allow_items
    if unique:
        self.allowed = list(set(self.allowed))

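A sketch of both accepted input forms (the paths are illustrative):

from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_allow("/blog/")              # single string (illustrative path)
ua.add_allow(["/blog/", "/docs/"])  # list; with unique=True the repeated "/blog/" is kept only once
# anything other than a str or list raises TypeError
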
add_disallow(disallow_items, unique=True, comments='')

Add disallowed items/pages/slugs to the current User Agent.

Parameters:
  • disallow_items (str or list) –

    single item or list of items disallowed for the current user agent.

  • unique (bool, default: True ) –

    If True, duplicate items are collapsed to a single value. Defaults to True.

  • comments (str, default: '' ) –

    Optional comment included for human readability. Defaults to “”.

pyrobotstxt/__init__.py
def add_disallow(self, disallow_items, unique=True, comments=""):
    """Add disallowed items/pages/slugs to current User Agent.

    Args:
        disallow_items (str, list): single item or list of items disallowed for current user agnet.
        unique (bool, optional): If True duplicate item stripped to single value. Defaults to True.
        comments (str, optional): Any comments for added value for human readability. Defaults to "".
    """
    if isinstance(disallow_items, str):
        disallow_items = [disallow_items]

    if not isinstance(disallow_items, list):
        raise TypeError(f"disallow_items must be a str or list, got {type(disallow_items)}")

    self.disallowed += disallow_items
    if unique:
        self.disallowed = list(set(self.disallowed))

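Usage mirrors add_allow (the paths are illustrative):

from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_disallow("/admin/")                  # single string (illustrative path)
ua.add_disallow(["/admin/", "/private/*"])  # list; duplicates collapse when unique=True
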
add_sitemap(site_map_path=None, comments='')

Add the path of a sitemap to the current user agent.

Parameters:
  • site_map_path (str, default: None ) –

    location of sitemap. Defaults to None.

  • comments (str, default: '' ) –

    any comments to include with sitemap path. Defaults to “”.

pyrobotstxt/__init__.py
def add_sitemap(self, site_map_path=None, comments=""):
    """add file path of sitemap to current user agent.

    Args:
        site_map_path (str): location of sitemap. Defaults to None.
        comments (str): any comments to include with sitemap path. Defaults to "".
    """
    if not site_map_path:
        raise ValueError("site_map_path must be provided")

    self.sitemaps.append(site_map_path)

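A short sketch (the sitemap URL is illustrative):

from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_sitemap("https://example.com/sitemap.xml")  # illustrative URL
# calling add_sitemap() without a path raises ValueError
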
consolidate()

Consolidate all the information (allowed, disallowed, sitemaps) into a single text string.

pyrobotstxt/__init__.py
def consolidate(self):
    """consolidate all the information (allowed, disallowed, sitemaps) in single text string."""

    self.content = f"User-agent: {self.user_agent_name}"

    # Support for including Crawl-delay; see feature request #1.
    if self.crawl_delay > 0:
        self.content += f"\nCrawl-delay: {self.crawl_delay}\n"

    if self.allowed:
        self.content += "\n# Allowed Patterns\n"
        self.content += "\n".join([f"Allow: {item}" for item in self.allowed])

    if self.disallowed:
        self.content += "\n\n# Disallowed Patterns\n"
        self.content += "\n".join([f"Disallow: {item}" for item in self.disallowed])

    if self.sitemaps:
        self.content += "\n\n# Site Maps\n"
        self.content += "\n".join([f"Sitemap: {item}" for item in self.sitemaps])

    self.content += "\n\n"

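Putting the pieces together; the commented output is what the code above produces for these illustrative values:

from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_allow("/blog/")                             # illustrative paths
ua.add_disallow("/admin/")
ua.add_sitemap("https://example.com/sitemap.xml")  # illustrative URL
ua.consolidate()
print(ua.content)
# User-agent: *
# # Allowed Patterns
# Allow: /blog/
#
# # Disallowed Patterns
# Disallow: /admin/
#
# # Site Maps
# Sitemap: https://example.com/sitemap.xml
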
disallow_pagination(prefix='/page/*', comments='')

Disallow crawling of pagination pages via the robots.txt file.

Parameters:
  • prefix (str, default: '/page/*' ) –

    URL prefix for pagination pages. Defaults to “/page/*”.

  • comments (str, default: '' ) –

    human readable comments for inclusion. Defaults to “”.

pyrobotstxt/__init__.py
def disallow_pagination(self, prefix="/page/*", comments=""):
    """Single function to disable pagination on a website using robots.txt file.

    Args:
        prefix (str, optional): URL prefix for pagination pages. Defaults to "/page/*".
        comments (str, optional): human readable comments for inclusion. Defaults to "".
    """
    self.add_disallow(disallow_items=prefix, comments=comments)

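A sketch with the default and a custom prefix (the custom prefix is illustrative):

from pyrobotstxt import UserAgent

ua = UserAgent()
ua.disallow_pagination()                       # adds "Disallow: /page/*" on consolidate()
ua.disallow_pagination(prefix="/blog/page/*")  # illustrative custom pagination prefix
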
remove_allow(allow_item)

Remove a previously added allowed item from the allowed list.

Parameters:
  • allow_item (str) –

    item to be removed.

pyrobotstxt/__init__.py
def remove_allow(self, allow_item):
    """Remove any previously added allowed item from allowed list.

    Args:
        allow_item (str): item to be removed.
    """

    if allow_item in self.allowed:
        self.allowed.remove(allow_item)

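A sketch with illustrative paths:

from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_allow(["/blog/", "/docs/"])  # illustrative paths
ua.remove_allow("/docs/")           # ua.allowed now contains only "/blog/"
ua.remove_allow("/missing/")        # not in the list, so silently ignored
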
remove_disallow(disallow_item)

Remove a previously added disallowed item from the disallowed list.

Parameters:
  • disallow_item (str) –

    item to be removed.

pyrobotstxt/__init__.py
def remove_disallow(self, disallow_item):
    """Remove any previously added disallowed item from allowed list.

    Args:
        disallow_item (str): item to be removed.
    """

    if disallow_item in self.disallowed:
        self.disallowed.remove(disallow_item)

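Usage mirrors remove_allow (the paths are illustrative):

from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_disallow(["/admin/", "/tmp/"])  # illustrative paths
ua.remove_disallow("/tmp/")            # ua.disallowed now contains only "/admin/"
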
remove_sitemap(site_map_path=None)

Remove a sitemap from the current user agent.

Parameters:
  • site_map_path (str, default: None ) –

    sitemap file path to be removed. Defaults to None.

pyrobotstxt/__init__.py
def remove_sitemap(self, site_map_path=None):
    """remove a sitemap from current user agent.

    Args:
        site_map_path (str): sitemap file path to be removed. Defaults to None.
    """

    if site_map_path in self.sitemaps:
        self.sitemaps.remove(site_map_path)
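
A sketch with an illustrative sitemap URL:

from pyrobotstxt import UserAgent

ua = UserAgent()
ua.add_sitemap("https://example.com/sitemap.xml")     # illustrative URL
ua.remove_sitemap("https://example.com/sitemap.xml")  # ua.sitemaps is empty again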