RobotsTxt Class

Source code in pyrobotstxt/__init__.py
class RobotsTxt:
    def __init__(self, version=""):
        """Intializes Robots.txt operations

        Args:
            version (str, optional): Version number (optional) for robots.txt. Defaults to "".
        """
        self.user_agents = []
        self.create_time = datetime.now()
        self.version = version
        self.image_branding = None
        self.header = ""  # message added to the start of the output file.
        self.footer = ""  # message added to the end of the output file.

    def read(self, robots_url):
        """Read a Remote Robots.txt file from a given URL

        If robots_txt is missing a robots.txt file extention then it will be automatically added.
        Parsing will only be carried out if robots_url returns a valid response object.

        Args:
            robots_url (str):  robots.txt url at a remote location.
        """

        self.create_time = datetime.now()
        robots_url = get_corrected_url(robots_url, "")
        response = get_remote_content(robots_url)

        if response.status_code < 400:
            for ua_item in response.text.split("User-agent:"):
                if ua_item:
                    ua_content_items = [
                        ua_split_item.strip()
                        for ua_split_item in ua_item.split("\n")
                        if ua_split_item
                    ]
                    if not ua_content_items[0].startswith("#"):
                        ua = UserAgent(ua_name=ua_content_items[0])
                        ua.add_allow(
                            [
                                it.split("Allow:")[-1]
                                for it in ua_content_items[1:]
                                if it.startswith("Allow:")
                            ]
                        )
                        ua.add_disallow(
                            [
                                it.split("Disallow:")[-1]
                                for it in ua_content_items[1:]
                                if it.startswith("Disallow:")
                            ]
                        )
                        # TODO: comments are collected here but not yet attached to the user agent
                        comment = [
                            it.split("# ")[-1]
                            for it in ua_content_items[1:]
                            if it.startswith("#")
                        ]

                        self.add_user_agent(ua=ua)

    def write(self, file_path="robots.txt"):
        """write robots.txt file at a given file_path location.

        Args:
            file_path (str, optional): location of robots.txt file. Defaults to "robots.txt".
        """

        with open(file_path, "w") as f:
            # include header
            if self.header:
                f.write(f"# {self.header}")

            # include user agents with consolidate text
            for ua in self.user_agents:
                ua.consolidate()
                f.write(ua.content)

            f.write("\n")

            # append ascii image, if available
            if self.image_branding:
                f.write(self.image_branding)

            # append footer message
            if self.footer:
                f.write(f"\n# {self.footer}")

    def include_header(self, message="", append_date=True):
        """include header message with/without creation date.

        Args:
            message (str, optional): header or header message. Defaults to "".
            append_date (bool, optional): Append date/time to the header. Defaults to True.
        """

        self.header = message

        if append_date:
            self.header += f"\n# Created on {self.create_time} using pyrobotstxt"

    def include_footer(self, message=""):
        """include footer message

        Args:
            message (str, optional): footer message. Defaults to "".
        """
        self.footer = message

    def include_image(self, image_path=None, desired_width=90):
        """includes ascii image provided at image_file

        Args:
            image_path (str): location of image file. Defaults to None.
            desired_width (int, optional): desired width of ASCII image. Defaults to 90(chars).
        """
        img = ImageAsASCII(image_path=image_path, desired_width=desired_width)
        img.map_to_ascii()
        self.image_branding = img.ascii_image

    def add_user_agent(self, ua):
        """Add/Append user agent to RobotsTxt

        Args:
            ua (UserAgent): user agent to be included in final robots.txt file.
        """
        self.user_agents.append(ua)

    def remove_user_agent(self, ua_name=""):
        """Remove user agent from RobotsTxt

        Args:
            ua_name (UserAgent): user agent to be removed from already included in robots.txt file.
        """
        self.user_agents -= [ua for ua in self.user_agents if ua.name == ua_name]

    @staticmethod
    def robots_name(crawl_bot):
        """Find robot name, if you know any keywrod about that crawl bot.

        Args:
            crawl_bot (str): description about the crawl bot. e.g. facebook

        Returns:
            (dict): all matching crawl bots with relevent information
        """
        return {
            robot: ROBOTS[robot]
            for robot in ROBOTS
            if crawl_bot.capitalize() in ROBOTS[robot]
        }

    @staticmethod
    def robots_details(crawl_bot):
        """Static Method to return details about any crawl bot.

        Args:
            crawl_bot (str): name of crawl bot

        Returns:
            (dict): information about all crawl bots matching to input string.
        """
        return {
            robot: ROBOTS[robot]
            for robot in ROBOTS
            if crawl_bot.lower() == robot.lower()
        }
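
Example: a minimal end-to-end sketch. It assumes UserAgent is importable from the same package and exposes the name, add_allow, and add_disallow members used by read() above; the domain and path are placeholders.

from pyrobotstxt import RobotsTxt, UserAgent  # UserAgent import path assumed

robots = RobotsTxt(version="1.0")
robots.include_header(message="robots.txt for example.com")  # placeholder domain

ua = UserAgent(ua_name="*")     # constructor keyword as used in read()
ua.add_disallow(["/private/"])  # list argument, as in read()
robots.add_user_agent(ua=ua)

robots.write(file_path="robots.txt")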

__init__(version='')

Initializes robots.txt operations.

Parameters:
  • version (str, default: '' ) –

    Version string for robots.txt. Defaults to "".

pyrobotstxt/__init__.py
def __init__(self, version=""):
    """Intializes Robots.txt operations

    Args:
        version (str, optional): Version number (optional) for robots.txt. Defaults to "".
    """
    self.user_agents = []
    self.create_time = datetime.now()
    self.version = version
    self.image_branding = None
    self.header = ""  # message added to the start of the output file.
    self.footer = ""  # message added to the end of the output file.
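
Example: a minimal construction sketch.

from pyrobotstxt import RobotsTxt

robots = RobotsTxt(version="2.0")  # the version string is kept on the instance
print(robots.version)              # -> "2.0"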

add_user_agent(ua)

Add/append a user agent to RobotsTxt.

Parameters:
  • ua (UserAgent) –

    User agent to be included in the final robots.txt file.

pyrobotstxt/__init__.py
def add_user_agent(self, ua):
    """Add/Append user agent to RobotsTxt

    Args:
        ua (UserAgent): user agent to be included in final robots.txt file.
    """
    self.user_agents.append(ua)
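
Example: a short sketch, assuming UserAgent is importable from the package and follows the ua_name/add_disallow usage seen in read() above; the bot name and path are placeholders.

from pyrobotstxt import RobotsTxt, UserAgent  # import path assumed

robots = RobotsTxt()
ua = UserAgent(ua_name="Googlebot")  # example bot name
ua.add_disallow(["/admin/"])
robots.add_user_agent(ua=ua)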

include_footer(message='')

Include a footer message.

Parameters:
  • message (str, default: '' ) –

    Footer message. Defaults to "".

pyrobotstxt/__init__.py
def include_footer(self, message=""):
    """include footer message

    Args:
        message (str, optional): footer message. Defaults to "".
    """
    self.footer = message
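
Example:

robots = RobotsTxt()
robots.include_footer(message="Generated with pyrobotstxt")
# write() emits it as "\n# Generated with pyrobotstxt" at the end of the file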

include_header(message='', append_date=True)

Include a header message, with or without the creation date.

Parameters:
  • message (str, default: '' ) –

    Header message. Defaults to "".

  • append_date (bool, default: True ) –

    Append the creation date/time to the header. Defaults to True.

pyrobotstxt/__init__.py
def include_header(self, message="", append_date=True):
    """include header message with/without creation date.

    Args:
        message (str, optional): header or header message. Defaults to "".
        append_date (bool, optional): Append date/time to the header. Defaults to True.
    """

    self.header = message

    if append_date:
        self.header += f"\n# Created on {self.create_time} using pyrobotstxt"
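
Example (the domain is a placeholder):

robots = RobotsTxt()
robots.include_header(message="robots.txt for example.com", append_date=True)
# the header now ends with "\n# Created on <timestamp> using pyrobotstxt"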

include_image(image_path=None, desired_width=90)

Include an ASCII rendering of the image at image_path.

Parameters:
  • image_path (str, default: None ) –

    Location of the image file. Defaults to None.

  • desired_width (int, default: 90 ) –

    Desired width of the ASCII image. Defaults to 90 (characters).

pyrobotstxt/__init__.py
def include_image(self, image_path=None, desired_width=90):
    """includes ascii image provided at image_file

    Args:
        image_path (str): location of image file. Defaults to None.
        desired_width (int, optional): desired width of ASCII image. Defaults to 90(chars).
    """
    img = ImageAsASCII(image_path=image_path, desired_width=desired_width)
    img.map_to_ascii()
    self.image_branding = img.ascii_image
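
Example: a short sketch; the image path is a placeholder.

robots = RobotsTxt()
robots.include_image(image_path="logo.png", desired_width=60)
# the ASCII rendering is stored on robots.image_branding and emitted by write()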

read(robots_url)

Read a remote robots.txt file from a given URL.

If robots_url is missing the robots.txt filename, it is appended automatically. Parsing is only carried out if robots_url returns a valid response object.

Parameters:
  • robots_url (str) –

    URL of a robots.txt file at a remote location.

pyrobotstxt/__init__.py
def read(self, robots_url):
    """Read a Remote Robots.txt file from a given URL

    If robots_txt is missing a robots.txt file extention then it will be automatically added.
    Parsing will only be carried out if robots_url returns a valid response object.

    Args:
        robots_url (str):  robots.txt url at a remote location.
    """

    self.create_time = datetime.now()
    robots_url = get_corrected_url(robots_url, "")
    response = get_remote_content(robots_url)

    if response.status_code < 400:
        for ua_item in response.text.split("User-agent:"):
            if ua_item:
                ua_content_items = [
                    ua_split_item.strip()
                    for ua_split_item in ua_item.split("\n")
                    if ua_split_item
                ]
                if not ua_content_items[0].startswith("#"):
                    ua = UserAgent(ua_name=ua_content_items[0])
                    ua.add_allow(
                        [
                            it.split("Allow:")[-1]
                            for it in ua_content_items[1:]
                            if it.startswith("Allow:")
                        ]
                    )
                    ua.add_disallow(
                        [
                            it.split("Disallow:")[-1]
                            for it in ua_content_items[1:]
                            if it.startswith("Disallow:")
                        ]
                    )
                    # TODO: comments are collected here but not yet attached to the user agent
                    comment = [
                        it.split("# ")[-1]
                        for it in ua_content_items[1:]
                        if it.startswith("#")
                    ]

                    self.add_user_agent(ua=ua)
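
Example: a short sketch; the URL is a placeholder. Each parsed user agent carries the name attribute that remove_user_agent() compares against.

robots = RobotsTxt()
robots.read("https://example.com/robots.txt")
for ua in robots.user_agents:
    print(ua.name)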

remove_user_agent(ua_name='')

Remove a user agent from RobotsTxt.

Parameters:
  • ua_name (str, default: '' ) –

    Name of the user agent to remove.

pyrobotstxt/__init__.py
def remove_user_agent(self, ua_name=""):
    """Remove user agent from RobotsTxt

    Args:
        ua_name (UserAgent): user agent to be removed from already included in robots.txt file.
    """
    self.user_agents -= [ua for ua in self.user_agents if ua.name == ua_name]
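
Example, after read() or add_user_agent() has populated user_agents:

robots.remove_user_agent(ua_name="Googlebot")  # drops any agent named "Googlebot"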

robots_details(crawl_bot) staticmethod

Return details about a crawl bot.

Parameters:
  • crawl_bot (str) –

    Name of the crawl bot.

Returns:
  • dict

    Information about all crawl bots matching the input string.

pyrobotstxt/__init__.py
@staticmethod
def robots_details(crawl_bot):
    """Static Method to return details about any crawl bot.

    Args:
        crawl_bot (str): name of crawl bot

    Returns:
        (dict): information about all crawl bots matching to input string.
    """
    return {
        robot: ROBOTS[robot]
        for robot in ROBOTS
        if crawl_bot.lower() == robot.lower()
    }
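
Example, assuming the bundled ROBOTS registry contains an entry for the bot (the name below is illustrative):

details = RobotsTxt.robots_details("googlebot")  # name match is case-insensitive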

robots_name(crawl_bot) staticmethod

Find robot names matching a keyword about a crawl bot.

Parameters:
  • crawl_bot (str) –

    Keyword describing the crawl bot, e.g. "facebook".

Returns:
  • dict

    All matching crawl bots with relevant information.

pyrobotstxt/__init__.py
@staticmethod
def robots_name(crawl_bot):
    """Find robot name, if you know any keywrod about that crawl bot.

    Args:
        crawl_bot (str): description about the crawl bot. e.g. facebook

    Returns:
        (dict): all matching crawl bots with relevent information
    """
    return {
        robot: ROBOTS[robot]
        for robot in ROBOTS
        if crawl_bot.capitalize() in ROBOTS[robot]
    }
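
Example (the keyword is capitalized before matching against each registry entry):

matches = RobotsTxt.robots_name("facebook")  # looks for "Facebook" in each ROBOTS entry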

write(file_path='robots.txt')

Write a robots.txt file at the given file_path location.

Parameters:
  • file_path (str, default: 'robots.txt' ) –

    Location of the robots.txt file. Defaults to "robots.txt".

pyrobotstxt/__init__.py
def write(self, file_path="robots.txt"):
    """write robots.txt file at a given file_path location.

    Args:
        file_path (str, optional): location of robots.txt file. Defaults to "robots.txt".
    """

    with open(file_path, "w") as f:
        # include header
        if self.header:
            f.write(f"# {self.header}")

        # include user agents with consolidate text
        for ua in self.user_agents:
            ua.consolidate()
            f.write(ua.content)

        f.write("\n")

        # append ascii image, if available
        if self.image_branding:
            f.write(self.image_branding)

        # append footer message
        if self.footer:
            f.write(f"\n# {self.footer}")