Context:
I’m working on a Python web-scraping project that extracts product details from multiple Shopee shops through Shopee’s public API. I’ve run into several challenges and would appreciate help with the issues below.
Problem Description:
Rate Limiting: The code currently lacks effective rate limiting, and I need help implementing a proper mechanism so I don’t get blocked by Shopee’s API (the first sketch after this list shows what I’m considering).
Proxy Management: The code rotates through a list of proxies, but I’m not sure it handles them correctly. What are the best practices for proxy management in this code?
Error Handling: The code currently handles only ProxyError and ConnectionError. Which additional exceptions should I handle, especially ones related to Shopee API responses? (See the second sketch after this list.)
Scalability: As I scale up my scraping, what should I consider to keep the code efficient and reliable?
Data Verification: After scraping, I need to confirm the accuracy and completeness of the scraped data. What validation and verification techniques would you suggest? (See the third sketch after this list.)
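For the rate-limiting point, here is the kind of mechanism I’m considering as a replacement for a fixed sleep() between requests: a minimal token-bucket sketch. The TokenBucket class and its parameters are my own invention for illustration, not from any library:

import time

class TokenBucket:
    """Minimal token bucket: refills `rate` tokens per second, holds at most `capacity`."""
    def __init__(self, rate, capacity):
        self.rate = rate
        self.capacity = capacity
        self.tokens = capacity
        self.last_refill = time.monotonic()

    def acquire(self):
        """Block until one token is available, then consume it."""
        while True:
            now = time.monotonic()
            self.tokens = min(self.capacity, self.tokens + (now - self.last_refill) * self.rate)
            self.last_refill = now
            if self.tokens >= 1:
                self.tokens -= 1
                return
            time.sleep((1 - self.tokens) / self.rate)

limiter = TokenBucket(rate=1.0, capacity=5)  # ~60 requests/minute, small bursts allowed
# limiter.acquire()  # would go right before each requests.get(...)

The idea is that short bursts are allowed up to `capacity` while the long-run rate stays bounded, which seems gentler than a strictly periodic delay.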
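For the error-handling point, this is the broader wrapper I’m experimenting with. The 'error' and 'error_msg' keys are my assumption about how Shopee flags API-level failures in the response body; I haven’t confirmed them against real responses:

import requests
from requests.exceptions import ProxyError, ConnectionError, Timeout, HTTPError

def safe_api_request(url, headers, proxies):
    """Experimental wrapper: returns the parsed payload, or None on any handled failure."""
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        response.raise_for_status()  # raises HTTPError on 4xx/5xx, e.g. 429 Too Many Requests
        payload = response.json()    # raises ValueError if the body is not valid JSON
        if payload.get('error'):     # assumption: API-level errors are flagged in the body
            print(f"API error: {payload.get('error_msg')}")
            return None
        return payload
    except Timeout:
        print(f"Request timed out: {url}")
    except HTTPError as e:
        print(f"HTTP error {e.response.status_code} for {url}")
    except (ProxyError, ConnectionError) as e:
        print(f"Proxy/connection error: {e}")
    except ValueError:
        print("Response body was not valid JSON (possibly a block page)")
    return None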
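For the data-verification point, my current idea is simple per-row sanity checks after scraping, plus comparing the number of rows written per shop against the item count the API returns. A toy sketch, where the column indexes follow the CSV_HEADER in my code below:

def validate_row(row):
    """Toy per-row checks; indexes follow the CSV_HEADER column order in my script."""
    problems = []
    name, stock, current_price = row[0], row[1], row[10]
    if not name:
        problems.append('missing name')
    if stock is not None and stock < 0:
        problems.append('negative stock')
    if current_price is not None and current_price <= 0:
        problems.append('non-positive price')
    return problems

# After scraping, rows that return a non-empty problem list could be logged
# or quarantined rather than silently written to the CSV.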
Any guidance, code improvements, or advice related to these issues would be highly appreciated. Thank you!
In my code below, I've implemented rotating proxies using itertools.cycle to avoid IP bans or restrictions when making requests to Shopee's API. However, I've had difficulties with this approach, and I'm unsure whether rotating proxies are even necessary for this task. Is it essential to use rotating proxies when scraping data from Shopee's public API? Are there rate limits or IP bans that necessitate their use? If rotating proxies are not necessary, what are the recommended alternatives for reliable and efficient scraping without risking IP-related issues? (One alternative I'm considering is sketched after the code.)
import requests
import csv
from itertools import cycle
from time import sleep
from fake_useragent import UserAgent
from requests.exceptions import ProxyError, ConnectionError
CSV_FILE_NAME = 'product_data_v2.csv'
CSV_HEADER = [
'Name', 'Stock', 'Shop name', 'Shop Id', 'Item Id', 'Sold per month',
'Number of sales', 'Liked count', 'Variations name', 'Variations',
'Current price', 'Minimum price', 'Maximum price', 'Has lowest price guarantee(T/F)',
'Show discount', 'Raw discount', 'Current discount', 'Item rating',
'Rating count', 'Shopee verified (T/F)', 'Shopee official (T/F)',
'CC installment payment eligible(T/F)', 'Non-CC installment payment eligible(T/F)',
'Bundle deal id', 'Can use bundle deal(T/F)', 'Can use wholesale(T/F)',
'Is preferred plus seller(T/F)', 'Shop location', 'Shop rating', 'Can use COD(T/F)',
'Is on flash sale(T/F)', 'Is service by shopee(T/F)', 'Flash sale sold count',
'Is mart(T/F)'
]
# Function to read seller Shopee IDs from a CSV file
def read_seller_shopee_id_from_csv(csv_file_name):
    seller_shopee_ids = []
    with open(csv_file_name, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            seller_shopee_ids.extend(row)
    return seller_shopee_ids
# Rotating User-Agent generator
user_agent = UserAgent()
# Read seller Shopee IDs from the CSV file
SELLER_SHOPEE_IDS = read_seller_shopee_id_from_csv('shope_id.csv')
# Function to read proxy IPs from a CSV file
def read_proxies_from_csv(csv_file_name):
    proxies = []
    with open(csv_file_name, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            proxies.append(row[0])  # proxy address is expected in the first column
    return proxies
proxy_list = read_proxies_from_csv('workingproxiesv2.csv')
proxy_pool = cycle(proxy_list)
# Rate-limit settings (adjust as needed)
REQUESTS_PER_MINUTE = 60
TIME_BETWEEN_REQUESTS = 60 / REQUESTS_PER_MINUTE  # 1 second between requests at 60/min
def make_api_request(seller_shopee_id, headers):
    proxy = next(proxy_pool)
    url = f'https://shopee.ph/api/v4/recommend/recommend?bundle=shop_page_product_tab_main&limit=999&offset=0&section=shop_page_product_tab_main_sec&shopid={seller_shopee_id}'
    try:
        response = requests.get(
            url,
            headers=headers,
            proxies={"http": proxy, "https": proxy},
            timeout=10,  # avoid hanging forever on a dead proxy
        )
        response.raise_for_status()  # raises HTTPError for any non-2xx status
        return response.json()['data']['sections'][0]['data']['item']
    except (ProxyError, ConnectionError) as e:
        print(f"Proxy error or connection error for {proxy}: {e}")
        return []
def extract_data(item):
    tier_variations = item.get('tier_variations')
    variation_name = tier_variations[0]['name'] if tier_variations else None
    variation_options = tier_variations[0]['options'] if tier_variations else None
    discount = item['discount'] if item.get('discount') is not None else 0
    item_rating = item.get('item_rating') or {}  # guard against a missing rating object
    # Column order must match CSV_HEADER
    data_row = [
        item.get('name'), item.get('stock'), item.get('shop_name'), item.get('shopid'),
        item.get('itemid'), item.get('sold'), item.get('historical_sold'), item.get('liked_count'),
        variation_name, variation_options,
        item.get('price') / 100000 if item.get('price') else None,  # prices are scaled by 10^5
        item.get('price_min') / 100000 if item.get('price_min') else None,
        item.get('price_max') / 100000 if item.get('price_max') else None,
        item.get('has_lowest_price_guarantee'), item.get('show_discount'),
        item.get('raw_discount'), discount,
        item_rating.get('rating_star'),
        item_rating.get('rating_count'),
        item.get('shopee_verified'),
        item.get('is_official_shop'),
        item.get('is_cc_installment_payment_eligible'),
        item.get('is_non_cc_installment_payment_eligible'),
        item.get('bundle_deal_id'),
        item.get('can_use_bundle_deal'),
        item.get('can_use_wholesale'),
        item.get('is_preferred_plus_seller'),
        item.get('shop_location'),
        item.get('shop_rating'),
        item.get('can_use_cod'),
        item.get('is_on_flash_sale'),
        item.get('is_service_by_shopee'),
        item.get('flash_sale_sold_count'),
        item.get('is_mart'),
    ]
    return data_row
def write_data_to_csv(data, csv_file_name):
    with open(csv_file_name, mode="a", newline="", encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerows(data)  # header is written once in main(), not per batch
def main():
    # Write the CSV header row once, before the loop
    with open(CSV_FILE_NAME, mode="w", newline="", encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(CSV_HEADER)
    for seller_shopee_id in SELLER_SHOPEE_IDS:
        headers = {'User-Agent': user_agent.random}
        data = make_api_request(seller_shopee_id, headers)
        if data:
            data_rows = [extract_data(item) for item in data]
            write_data_to_csv(data_rows, CSV_FILE_NAME)
            print(f'{len(data)} items have been written to {CSV_FILE_NAME}')
        # Rate limiting: fixed delay between requests
        sleep(TIME_BETWEEN_REQUESTS)
# Entry point of the script
if __name__ == "__main__":
    main()
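Regarding the rotating-proxies question: if they turn out to be unnecessary, the alternative I’m considering is a single requests.Session with automatic retries and exponential backoff via urllib3’s Retry. The status codes and retry counts below are my guesses, not documented Shopee limits:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# One shared session with retry/backoff instead of rotating proxies
session = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=1,                       # exponential backoff between attempts
    status_forcelist=[429, 500, 502, 503],  # retry on rate-limit and server errors
    allowed_methods=["GET"],                # requires urllib3 >= 1.26
)
session.mount('https://', HTTPAdapter(max_retries=retries))
# response = session.get(url, headers={'User-Agent': user_agent.random}, timeout=10)

Would something like this be enough for Shopee’s public endpoints, or are rotating proxies still necessary at scale?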