Analyzing ChatGPT Robots.txt Blocks
Publish date: May 1, 2024
Last updated: May 15, 2024
Last updated: May 15, 2024
How to Analyze Websites for ChatGPT Robots.txt Blocks
You will need to source lists of URLs/websites which can be found here:
- SEMrush Top Traffic Websites
- Github Website List
- Your Other Sourced Websites
Pre-Req Questions:
- Should you block ChatGPT from accessing your website content?
- Does sensitive copyright information need protection?
- Are competitors blocking chatGPT?
- Do I care about AI-search powered traffic?
import requests
import json
from urllib.parse import urlparse
def _parse_robots_text(text):
    """Parse the body of a robots.txt file into a per-user-agent dict."""
    robots_json = {}
    # Agents of the group currently being read. Consecutive User-agent
    # lines share the rules that follow them, so this can hold several names.
    group_agents = []
    # True while we are still reading the User-agent lines that open a group.
    reading_agents = False

    for raw_line in text.splitlines():
        # Drop inline comments and surrounding whitespace (including '\r').
        line = raw_line.split('#', 1)[0].strip()
        field, separator, value = line.partition(':')
        if not separator:
            continue
        # Field names are matched case-insensitively ("user-agent", "User-Agent").
        field = field.strip().lower()
        value = value.strip()

        if field == 'user-agent':
            if not reading_agents:
                # A User-agent line after rule lines starts a new group.
                group_agents = []
                reading_agents = True
            group_agents.append(value)
            # setdefault keeps rules already collected if an agent is re-declared.
            robots_json.setdefault(value, {"Disallow": [], "Allow": []})
        elif field in ('disallow', 'allow'):
            reading_agents = False
            rule_key = "Disallow" if field == 'disallow' else "Allow"
            # Rules that appear before any User-agent line have no owner
            # and are skipped (group_agents is empty).
            for agent in group_agents:
                robots_json[agent][rule_key].append(value)
    return robots_json


def fetch_and_parse_robots_to_json(url, timeout=10):
    """
    Fetches a robots.txt file from the specified URL and parses it into a JSON structure.

    Args:
        url: Base URL of the site (e.g. "https://www.example.com"); "/robots.txt"
            is appended to it. A trailing slash on the base URL is tolerated.
        timeout: Seconds to wait for the server before giving up, so a single
            unresponsive host cannot stall the whole run.

    Returns:
        A dict mapping each user-agent name to {"Disallow": [...], "Allow": [...]},
        or {"error": "<message>"} if the request failed or returned an error status.
    """
    try:
        response = requests.get(url.rstrip('/') + "/robots.txt", timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        return {"error": str(e)}
    return _parse_robots_text(response.text)
def process_urls(file_path):
    """
    Fetches robots.txt for every URL listed in a file and saves each result as JSON.

    Args:
        file_path: Path to a text file with one URL per line. Blank lines are
            skipped. Entries without a scheme (e.g. "example.com") are treated
            as "https://example.com".

    Side effects:
        Writes "robots_<domain>.json" to the current working directory for
        each URL and prints progress messages to the terminal.
    """
    with open(file_path, 'r') as file:
        for url in file:
            url = url.strip()
            if not url:
                continue
            print(f"Processing {url}")
            # Without a scheme urlparse() reports an empty netloc, which would
            # make every such site overwrite the same "robots_.json" file.
            if '://' not in url:
                url = 'https://' + url
            robots_json = fetch_and_parse_robots_to_json(url)
            pretty_json = json.dumps(robots_json, indent=4)
            # Extract domain name for the filename
            domain_name = urlparse(url).netloc
            # Strip only a leading "www." rather than every occurrence in the host.
            if domain_name.startswith('www.'):
                domain_name = domain_name[len('www.'):]
            # A port separator is not a valid filename character on every platform.
            domain_name = domain_name.replace(':', '_')
            filename = f"robots_{domain_name}.json"
            with open(filename, 'w', encoding='utf-8') as output_file:
                output_file.write(pretty_json)
            print(f"JSON data written to {filename}")
# Example usage: 'urls.txt' is a placeholder - replace it with the path to
# your own text file that lists one URL per line.
process_urls('urls.txt')
#outputs to terminal
Processing https://www.google.com
JSON data written to robots_google.com.json
Processing https://www.youtube.com
JSON data written to robots_youtube.com.json
Processing https://www.reddit.com
JSON data written to robots_reddit.com.json
Processing https://www.facebook.com
JSON data written to robots_facebook.com.json
Processing https://www.amazon.com
JSON data written to robots_amazon.com.json
Processing https://duckduckgo.com
JSON data written to robots_duckduckgo.com.json
#file outputs
{
"*": {
"Disallow": [
"/exec/obidos/account-access-login",
"/exec/obidos/change-style",
"/exec/obidos/flex-sign-in",
"/exec/obidos/handle-buy-box",
"/exec/obidos/tg/cm/member/",
"/gp/aw/help/id=sss",
"/gp/cart",
"/gp/flex",
"/gp/product/e-mail-friend",
"/gp/product/product-availability",
"/gp/product/rate-this-item",
"/gp/sign-in",
"/gp/reader",
"/gp/sitbv3/reader",
"/gp/richpub/syltguides/create",
"/gp/gfix",
"/gp/associations/wizard.html",
"/gp/dmusic/order",
"/gp/legacy-handle-buy-box.html",
"/gp/aws/ssop",
"/gp/yourstore",
"/gp/gift-central/organizer/add-wishlist",
"/gp/vote",
"/gp/voting/",
"/gp/music/wma-pop-up",
"/gp/customer-images",
"/gp/richpub/listmania/createpipeline",
"/gp/content-form",
"/gp/pdp/invitation/invite",
"/gp/customer-reviews/common/du",
"/gp/customer-reviews/write-a-review.html",
"/gp/associations/wizard.html",
"/gp/music/clipserve",
"/gp/customer-media/upload",
"/gp/history",
"/gp/item-dispatch",
"/gp/dmusic/order/handle-buy-box.html",
"/gp/recsradio",
"/gp/slredirect",
"/dp/shipping/",
"/dp/twister-update/",
"/dp/manual-submit/",
"/dp/e-mail-friend/",
"/dp/product-availability/",
"/dp/rate-this-item/",
"/gp/registry/wishlist/*/reserve",
"/gp/structured-ratings/actions/get-experience.html",
"/gp/twitter/",
"/ap/signin",
"/gp/registry/wishlist/",
"/wishlist/",
"/gp/wishlist/",
"/registry/wishlist/",
"/review/common/du",
"/gp/registry/search.html",
"/product-reviews/B0069IY63Y",
"/gp/orc/rml/",
"*/gcrnsts",
"/gp/gc/widget",
"/gp/dmusic/mp3/player",
"/gp/entity-alert/external",
"/gp/customer-reviews/dynamic/sims-box",
"/review/dynamic/sims-box",
"/gp/redirect.html",
"/gp/twister/ajaxv2",
"/ss/twister/ajax",
"/b?*node=7454917011",
"/b?*node=7454927011",
"/b?*node=7454939011",
"/b?*node=7454898011",
"/gp/customer-media/actions/delete/",
"/gp/customer-media/actions/edit-caption/",
"/gp/dmusic/",
"/gp/offer-listing/",
"/b?*node=9052533011",
"/lm/R1XIHQVKXSKBNJ",
"/lm/R3HQ5WJSZK6QSO",
"/surprise/",
"/local/ajax/",
"*/B00M3E1NYI",
"*/B00M3E1Q5Y",
"*/B00M3E1TOM",
"*/B00M3E1WYO",
"*/B00M3E204K",
"*/B00M3E236A",
"*/B00M3E260I",
"*/B00M3E28WO",
"*/B00M3E2BC6",
"*/B00M3E2DPQ",
"*/B00M3E2GU8",
"*/B00M3E2J14",
"*/B00M3E2LOE",
"*/B00M3E1HJY",
"/gp/socialmedia/giveaways",
"/gp/b2b-rd",
"/gp/aw/so.html",
"/gp/rentallist",
"/gp/video/dvd-rental/settings",
"/gp/rl/settings",
"/gp/video/settings",
"/gp/video/library",
"/gp/video/watchlist",
"/reviews/iframe",
"/gp/switch-language",
"/ga/p/",
"/gp/profile/",
"/giveaway/host/setup/",
"/ss/customer-reviews/lighthouse/",
"/ospublishing/story/*",
"/gp/aw/ol/",
"/gp/promotion/",
"/hz/leaderboard/top-reviewers/",
"/creatorhub",
"/creatorhub/*",
"/slp/s$",
"/-/",
"/hz/help/contact/*/message/$",
"/gp/aw/shoppingAids/",
"/rss/people/*/reviews",
"/gp/pdp/rss/*/reviews",
"/gp/cdp/member-reviews/",
"/gp/aw/cr/",
"*/sim/B001132UEE",
"/gp/aag",
"/gp/pdp/profile/",
"/gp/help/customer/express/c2c/",
"/slp/*/b$",
"/hz/contact-us/ajax/initiate-trusted-contact/",
"/gp/video/api",
"/hp/video/api",
"/gp/video/mystuff",
"/hp/video/mystuff",
"/gp/video/profiles",
"/hp/video/profiles"
],
"Allow": [
"/wishlist/universal*",
"/wishlist/vendor-button*",
"/wishlist/get-button*",
"/gp/wishlist/universal*",
"/gp/wishlist/vendor-button*",
"/gp/wishlist/ipad-install*",
"/gp/dmusic/promotions/PrimeMusic",
"/gp/dmusic/promotions/AmazonMusicUnlimited",
"/-/es/",
"/-/en$",
"/-/zh_TW/",
"/-/zh_TW$",
"/-/he/",
"/-/he$",
"/gp/offer-listing/B000",
"/gp/offer-listing/9000",
"/gp/aag/main?*seller=ABVFEJU8LS620"
]
},
"EtaoSpider": {
"Disallow": [
"/"
],
"Allow": []
},
"GPTBot": {
"Disallow": [
"/"
],
"Allow": []
},
"CCBot": {
"Disallow": [
"/"
],
"Allow": []
}
}
Disqus comments are disabled.