Analyzing ChatGPT Robots.txt Blocks

Publish date: May 1, 2024
Last updated: May 15, 2024

Tags:

How to Analyze Websites for ChatGPT Robots.txt Blocks

You will need to source lists of URLs/websites which can be found here:

Pre-Req Questions:

Should you block ChatGPT from accessing website content?
Does sensitive or copyrighted information need protection?
Are competitors blocking ChatGPT?
Do I care about AI-search-powered traffic?


import requests
import json
from urllib.parse import urlparse

def fetch_and_parse_robots_to_json(url, timeout=10):
    """
    Fetch a site's robots.txt and parse it into a JSON-serializable dict.

    Args:
        url: Base URL of the site (e.g. "https://www.example.com"). A trailing
            slash is tolerated.
        timeout: Seconds to wait for the server before giving up, so a single
            unresponsive host cannot stall the whole run.

    Returns:
        On success, a dict mapping each user-agent name to
        {"Disallow": [...], "Allow": [...]}.
        On any request failure (connection error, timeout, HTTP error status),
        {"error": "<message>"}.
    """
    try:
        # rstrip avoids requesting ".../​/robots.txt" when url ends with "/".
        response = requests.get(url.rstrip('/') + "/robots.txt", timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        return {"error": str(e)}

    return _parse_robots_text(response.text)


def _parse_robots_text(text):
    """
    Parse the text of a robots.txt file into {agent: {"Disallow", "Allow"}}.

    Directive names are matched case-insensitively, inline "#" comments are
    dropped, and consecutive User-agent lines form one group whose rules apply
    to every agent in that group. Rules that appear before any User-agent line
    are ignored because they belong to no agent.
    """
    robots_json = {}
    group_agents = []        # user agents of the group currently being read
    group_has_rules = False  # True once the current group has seen a rule

    # A UTF-8 BOM would otherwise hide the first directive on the first line.
    for raw_line in text.lstrip('\ufeff').splitlines():
        line = raw_line.split('#', 1)[0].strip()
        if ':' not in line:
            continue

        field, value = line.split(':', 1)
        field = field.strip().lower()
        value = value.strip()

        if field == 'user-agent':
            # A User-agent line after rules starts a new group; consecutive
            # User-agent lines extend the current one.
            if group_has_rules:
                group_agents = []
                group_has_rules = False
            group_agents.append(value)
            # setdefault keeps rules already collected for a repeated agent.
            robots_json.setdefault(value, {"Disallow": [], "Allow": []})
        elif field in ('disallow', 'allow'):
            group_has_rules = True
            key = "Disallow" if field == 'disallow' else "Allow"
            for agent in group_agents:
                robots_json[agent][key].append(value)

    return robots_json

def process_urls(file_path):
    """
    Fetch and save the parsed robots.txt for every URL listed in a file.

    Reads one URL per line from file_path (blank lines are skipped), parses
    each site's robots.txt via fetch_and_parse_robots_to_json, and writes the
    result as indented JSON to "robots_<domain>.json" in the current working
    directory. Progress is printed to the terminal.

    Args:
        file_path: Path to a text file containing one URL per line.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        for url in file:
            url = url.strip()
            if not url:
                continue

            print(f"Processing {url}")

            # urlparse only fills in netloc when a scheme is present; without
            # one every bare domain would be written to "robots_.json".
            if '://' not in url:
                url = 'https://' + url

            robots_json = fetch_and_parse_robots_to_json(url)
            pretty_json = json.dumps(robots_json, indent=4)

            # Extract domain name for the filename. Only strip a leading
            # "www." so hosts that merely contain it are left intact.
            domain_name = urlparse(url).netloc
            if domain_name.startswith('www.'):
                domain_name = domain_name[len('www.'):]
            filename = f"robots_{domain_name}.json"

            with open(filename, 'w', encoding='utf-8') as output_file:
                output_file.write(pretty_json)

            print(f"JSON data written to {filename}")

# Example usage - replace 'urls.txt' with your file containing URLs.
# Guarded so that importing this module does not start fetching URLs.
if __name__ == "__main__":
    process_urls('urls.txt')

#outputs to terminal

Processing https://www.google.com
JSON data written to robots_google.com.json
Processing https://www.youtube.com
JSON data written to robots_youtube.com.json
Processing https://www.reddit.com
JSON data written to robots_reddit.com.json
Processing https://www.facebook.com
JSON data written to robots_facebook.com.json
Processing https://www.amazon.com
JSON data written to robots_amazon.com.json
Processing https://duckduckgo.com
JSON data written to robots_duckduckgo.com.json

#file outputs

{
    "*": {
        "Disallow": [
            "/exec/obidos/account-access-login",
            "/exec/obidos/change-style",
            "/exec/obidos/flex-sign-in",
            "/exec/obidos/handle-buy-box",
            "/exec/obidos/tg/cm/member/",
            "/gp/aw/help/id=sss",
            "/gp/cart",
            "/gp/flex",
            "/gp/product/e-mail-friend",
            "/gp/product/product-availability",
            "/gp/product/rate-this-item",
            "/gp/sign-in",
            "/gp/reader",
            "/gp/sitbv3/reader",
            "/gp/richpub/syltguides/create",
            "/gp/gfix",
            "/gp/associations/wizard.html",
            "/gp/dmusic/order",
            "/gp/legacy-handle-buy-box.html",
            "/gp/aws/ssop",
            "/gp/yourstore",
            "/gp/gift-central/organizer/add-wishlist",
            "/gp/vote",
            "/gp/voting/",
            "/gp/music/wma-pop-up",
            "/gp/customer-images",
            "/gp/richpub/listmania/createpipeline",
            "/gp/content-form",
            "/gp/pdp/invitation/invite",
            "/gp/customer-reviews/common/du",
            "/gp/customer-reviews/write-a-review.html",
            "/gp/associations/wizard.html",
            "/gp/music/clipserve",
            "/gp/customer-media/upload",
            "/gp/history",
            "/gp/item-dispatch",
            "/gp/dmusic/order/handle-buy-box.html",
            "/gp/recsradio",
            "/gp/slredirect",
            "/dp/shipping/",
            "/dp/twister-update/",
            "/dp/manual-submit/",
            "/dp/e-mail-friend/",
            "/dp/product-availability/",
            "/dp/rate-this-item/",
            "/gp/registry/wishlist/*/reserve",
            "/gp/structured-ratings/actions/get-experience.html",
            "/gp/twitter/",
            "/ap/signin",
            "/gp/registry/wishlist/",
            "/wishlist/",
            "/gp/wishlist/",
            "/registry/wishlist/",
            "/review/common/du",
            "/gp/registry/search.html",
            "/product-reviews/B0069IY63Y",
            "/gp/orc/rml/",
            "*/gcrnsts",
            "/gp/gc/widget",
            "/gp/dmusic/mp3/player",
            "/gp/entity-alert/external",
            "/gp/customer-reviews/dynamic/sims-box",
            "/review/dynamic/sims-box",
            "/gp/redirect.html",
            "/gp/twister/ajaxv2",
            "/ss/twister/ajax",
            "/b?*node=7454917011",
            "/b?*node=7454927011",
            "/b?*node=7454939011",
            "/b?*node=7454898011",
            "/gp/customer-media/actions/delete/",
            "/gp/customer-media/actions/edit-caption/",
            "/gp/dmusic/",
            "/gp/offer-listing/",
            "/b?*node=9052533011",
            "/lm/R1XIHQVKXSKBNJ",
            "/lm/R3HQ5WJSZK6QSO",
            "/surprise/",
            "/local/ajax/",
            "*/B00M3E1NYI",
            "*/B00M3E1Q5Y",
            "*/B00M3E1TOM",
            "*/B00M3E1WYO",
            "*/B00M3E204K",
            "*/B00M3E236A",
            "*/B00M3E260I",
            "*/B00M3E28WO",
            "*/B00M3E2BC6",
            "*/B00M3E2DPQ",
            "*/B00M3E2GU8",
            "*/B00M3E2J14",
            "*/B00M3E2LOE",
            "*/B00M3E1HJY",
            "/gp/socialmedia/giveaways",
            "/gp/b2b-rd",
            "/gp/aw/so.html",
            "/gp/rentallist",
            "/gp/video/dvd-rental/settings",
            "/gp/rl/settings",
            "/gp/video/settings",
            "/gp/video/library",
            "/gp/video/watchlist",
            "/reviews/iframe",
            "/gp/switch-language",
            "/ga/p/",
            "/gp/profile/",
            "/giveaway/host/setup/",
            "/ss/customer-reviews/lighthouse/",
            "/ospublishing/story/*",
            "/gp/aw/ol/",
            "/gp/promotion/",
            "/hz/leaderboard/top-reviewers/",
            "/creatorhub",
            "/creatorhub/*",
            "/slp/s$",
            "/-/",
            "/hz/help/contact/*/message/$",
            "/gp/aw/shoppingAids/",
            "/rss/people/*/reviews",
            "/gp/pdp/rss/*/reviews",
            "/gp/cdp/member-reviews/",
            "/gp/aw/cr/",
            "*/sim/B001132UEE",
            "/gp/aag",
            "/gp/pdp/profile/",
            "/gp/help/customer/express/c2c/",
            "/slp/*/b$",
            "/hz/contact-us/ajax/initiate-trusted-contact/",
            "/gp/video/api",
            "/hp/video/api",
            "/gp/video/mystuff",
            "/hp/video/mystuff",
            "/gp/video/profiles",
            "/hp/video/profiles"
        ],
        "Allow": [
            "/wishlist/universal*",
            "/wishlist/vendor-button*",
            "/wishlist/get-button*",
            "/gp/wishlist/universal*",
            "/gp/wishlist/vendor-button*",
            "/gp/wishlist/ipad-install*",
            "/gp/dmusic/promotions/PrimeMusic",
            "/gp/dmusic/promotions/AmazonMusicUnlimited",
            "/-/es/",
            "/-/en$",
            "/-/zh_TW/",
            "/-/zh_TW$",
            "/-/he/",
            "/-/he$",
            "/gp/offer-listing/B000",
            "/gp/offer-listing/9000",
            "/gp/aag/main?*seller=ABVFEJU8LS620"
        ]
    },
    "EtaoSpider": {
        "Disallow": [
            "/"
        ],
        "Allow": []
    },
    "GPTBot": {
        "Disallow": [
            "/"
        ],
        "Allow": []
    },
    "CCBot": {
        "Disallow": [
            "/"
        ],
        "Allow": []
    }
}