MCX India Spider Data Extraction Script
MCX India Spider Data Extraction Script
# import pandas as pd
# import re
# import os
# import traceback
# import warnings
# from io import BytesIO
# from datetime import datetime
# from [Link] import urljoin
#
# from [Link] import TextResponse
#
# from src.helpers_scrapy.scrap_logger import ScrapperLogger
# from src.helpers_scrapy.global_helpers import update_scraping_status_on_success,
update_scraping_status_on_failure, \
# upload_to_s3_and_update_db
# from src.helpers_scrapy.proxy_retry_helper import ProxyRetryHelper
#
# [Link]("ignore", category=UserWarning, module="openpyxl")
#
# logger = ScrapperLogger(script_name="mcxindia_spider", source="www_mcxindia_com")
#
# settings = {
# 'BOT_NAME': 'w1atchlist_scrapper',
# 'ROBOTSTXT_OBEY': False,
# 'CONCURRENT_REQUESTS': 1,
# 'DOWNLOAD_DELAY': 3,
# 'COOKIES_ENABLED': True,
# 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
# }
#
#
# def clean_text(text):
# text_with_spaces = [Link](r'[^\w\s/\'\-&]', ' ', str(text))
# cleaned_data = [Link](r'\s+', ' ', text_with_spaces).strip()
# return cleaned_data if cleaned_data else ""
# class McxSpiderDataSpider([Link]):
# name = "mcxindia_spider"
# source_name = "www_mcxindia_com"
# source_id = None
#
# def __init__(self, source_dict=None, *args, **kwargs):
# try:
# super(McxSpiderDataSpider, self).__init__(*args, **kwargs)
# if source_dict and "source_id" in source_dict:
# self.source_id = source_dict.get("source_id")
# self.source_description = source_dict.get("source_description",
# "List of Surrendered/Ceased
Members") if source_dict else "List of Surrendered/Ceased Members"
# [Link](f"Initializing {[Link]} spider for
{self.source_name}",
# source_description=self.source_description)
# self.proxy_helper = ProxyRetryHelper(logger)
# self.data_df = None
# self.output_file_path = None
# self.source_dict = source_dict
# self.error_message = None
# [Link] = True
#
# # headers
# [Link] = {
# 'Accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/
webp,image/apng,*/*;q=0.8',
# 'Accept-Language': 'en-US,en;q=0.9',
# 'Connection': 'keep-alive',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/[Link] Safari/537.36'
# }
# [Link] = {
# '_gid': 'GA1.2.1382501654.1744804578',
# '_ga_8BQ43G0902': 'GS1.1.1744804578.1.0.1744804578.0.0.0',
# '_ga': 'GA1.1.1289432087.1744804578',
# 'ASP.NET_SessionId': 'lexvhplwoxq2z1tjh5nukaj2',
# }
# except Exception as e:
# [Link]("CRITICAL ERROR during initialization",
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = f"Initialization error: {str(e)}"
# raise
# # this function fetch starting page response and send to parse
# def start_requests(self):
# try:
# req_url = '[Link]
surrender-members'
# [Link](f"Starting data extraction process",
# url=req_url,
# source=self.source_name,
# source_description=self.source_description)
#
# if not hasattr(self, 'proxy_helper') or self.proxy_helper is None:
# error_message = "proxy_helper is not initialized properly"
# [Link](error_message)
# [Link] = False
# self.error_message = error_message
# return
#
# [Link](f"Fetching initial content with proxy retry helper",
url=req_url)
#
# try:
# html_content = self.proxy_helper.fetch_with_retries(req_url)
# except Exception as proxy_error:
# error_message = f"Error during proxy fetch: {str(proxy_error)}"
# [Link](error_message,
# error_details=str(proxy_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return
#
# if html_content:
# # Create a TextResponse object to parse the HTML
# response = TextResponse(url=req_url, body=html_content,
encoding='utf-8')
# [Link](f"Successfully retrieved content, proceeding to
parse",
# content_length=len(html_content))
#
# # Create a new request with proper headers
# yield [Link](
# url=req_url,
# callback=[Link],
# dont_filter=True,
# meta={
# 'response_object': response,
# 'original_headers': [Link],
# 'original_cookies':[Link]
# },
# headers=[Link],
# cookies=[Link])
# else:
# error_message = f"Failed to retrieve data from {req_url} after
all retry attempts"
# [Link](error_message, url=req_url)
# [Link] = False
# self.error_message = error_message
# return []
#
# except Exception as e:
# error_message = f"Critical error in start_requests: {str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return []
#
# # this function is fetching excel xpath and send response in extract row data
function
# def parse(self, response, **kwargs):
# try:
# [Link](f"Starting parsing process", url=[Link])
#
# # Get headers from meta
# headers = [Link]('original_headers', {})
# cookies = [Link]('original_cookies', {})
#
# if 'response_object' in [Link]:
# response = [Link]['response_object']
# [Link]("Using pre-fetched response from meta")
#
# try:
# relative_url = [Link](
# '//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
# ).get()
#
# # Check if we found the expected data structure
# if not relative_url:
# [Link]("Excel file link not found in initial parse
attempt",
# xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')
#
# try:
# [Link]("Attempting retry to fetch content",
url=[Link])
# html_content =
self.proxy_helper.fetch_with_retries([Link])
#
# if html_content:
# response = TextResponse(url=[Link],
body=html_content, encoding='utf-8')
# relative_url = [Link](
# '//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
# ).get()
#
# if not relative_url:
# error_message = "Failed to find Excel file link
even after retry"
# [Link](error_message,
# xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')
# [Link] = False
# self.error_message = error_message
# return
# else:
# [Link]("Successfully retrieved Excel link
after retry")
# else:
# error_message = f"All proxy attempts failed for
{[Link]}"
# [Link](error_message, url=[Link])
# [Link] = False
# self.error_message = error_message
# return
# except Exception as retry_error:
# error_message = f"Error during retry attempt:
{str(retry_error)}"
# [Link](error_message,
# error_details=str(retry_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return
# else:
# [Link]("Found Excel file link in initial parse")
#
# # Process the Excel file link
# if relative_url:
# full_url = urljoin([Link], relative_url)
# [Link](f"Found Excel file link, proceeding to download",
# excel_url=full_url)
#
# try:
# # Create a new request with the original headers
# yield [Link](
# url=full_url,
# headers=headers,
# cookies=cookies,
# callback=self._extract_row_data,
# meta={
# 'original_headers': headers,
# 'original_cookies':cookies}
# )
# except Exception as request_error:
# error_message = f"Error creating Excel download request:
{str(request_error)}"
# [Link](error_message,
# error_details=str(request_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return
#
# except Exception as parse_error:
# error_message = f"Error while parsing HTML content:
{str(parse_error)}"
# [Link](error_message,
# error_details=str(parse_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
#
# except Exception as e:
# error_message = f"Unexpected critical error in parse method:
{str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# # this function is used for fetch excel inner row data properly and manage
some columns
# def _extract_row_data(self, response):
# try:
# excel_url = getattr(response, 'url', 'URL_NOT_AVAILABLE')
# [Link]("Starting Excel data extraction process",
# url=excel_url,
# source=self.source_name,
# source_description=self.source_description)
#
# extracted_data = []
# unclean_records = []
#
# try:
# xls_file = BytesIO([Link])
# xls = [Link](xls_file, engine='openpyxl')
# # because we have 2 sheets so we need to do like this for get
data one by one
# for sheet_name in xls.sheet_names:
# try:
# [Link](f"Processing sheet: {sheet_name}")
# df = [Link](sheet_name)
# [Link] = [Link]()
# raw_df = [Link]()
# raw_df['status'] = sheet_name
# unclean_records.extend(raw_df.to_dict('records'))
# required_cols = ['Name of the Member', 'Member ID',
# 'Date of Approval for Surrender of
membership',
# 'SEBI Reg. No']
# for col in required_cols:
# if col not in [Link]:
# df[col] = ''
# if 'Member ID' in [Link]:
# df['Member ID'] = df['Member
ID'].astype(str).[Link]()
# # need to do date format like yyyy/mm/dd format
# if 'Date of Approval for Surrender of membership' in
[Link]:
# df['Date of Approval for Surrender of membership'] =
pd.to_datetime(
# df['Date of Approval for Surrender of
membership'], errors='coerce'
# ).[Link]('%Y/%m/%d')
# df['Date of Approval for Surrender of membership'] =
df[
# 'Date of Approval for Surrender of
membership'].fillna('')
#
# df['status'] = sheet_name
#
# for _, row in [Link]():
# try:
# record = {
# 'Name of the Member': '',
# 'Member ID': '',
# 'SEBI Reg. No': '',
# 'Date of Approval for Surrender of
membership': '',
# 'status': sheet_name,
# 'Alias': '',
# 'Proprietor': ''
# }
#
# for field in ['Name of the Member', 'Member ID',
# 'Date of Approval for Surrender of
membership',
# 'SEBI Reg. No']:
# if field in row:
# record[field] = str(row[field]).strip()
if [Link](row[field]) else ''
#
# name = record['Name of the Member']
# # need to check proprietor if getting
# if 'Proprietor' in name:
# proprietor_match = [Link](r'Proprietor
(.*?)(?=\s*$|\s*\(|$)', name)
# if proprietor_match:
# record['Proprietor']
=clean_text(proprietor_match.group(1))
# name = [Link](f"Proprietor
{proprietor_match.group(1)}", "").strip()
# # in alias we getting like this data so need to
remove this starting
# alias_phrases = [
# r'Formerly Known as (.*?)(?=\s*$|\s*\(|$)',
# r'Currently known as (.*?)(?=\s*$|\s*\(|$)',
# r'A Division of (.*?)(?=\s*$|\s*\(|$)',
# r'Division of (.*?)(?=\s*$|\s*\(|$)',
# r'Commodity trading division - (.*?)(?=\s*$|\
s*\(|$)',
# r'\((.*?)\)'
# ]
#
# alias = None
# for phrase in alias_phrases:
# match = [Link](phrase, name,
[Link])
# if match:
# alias =clean_text([Link](1))
# name = [Link]([Link](0),
"").strip()
# break
# # if found this word then need to ignore this
because this is not Alias here
# if alias:
# ignore_list = ['india', 'plastic', 'p',
'gujarat', 'pvt.', 'pvt', 'i', 'l', 'guj',
# 'Import & Export', 'Agencies',
'HUF']
# if [Link]() not in [[Link]() for x in
ignore_list]:
# record['Alias'] = alias
#
# record['Name of the Member'] =clean_text(name)
# extracted_data.append(record)
#
# except Exception as row_error:
# [Link](f"Error processing row in sheet
{sheet_name}",
# error_details=str(row_error))
# continue
#
# except Exception as sheet_error:
# [Link](f"Error processing sheet {sheet_name}",
# error_details=str(sheet_error),
# traceback=traceback.format_exc())
# continue
#
# if unclean_records:
# unclean_df = [Link](unclean_records)
# self._save_uncleaned_data(unclean_df)
# if not extracted_data:
# error_message = "No data extracted from Excel file"
# [Link](error_message, url=excel_url)
# [Link] = False
# self.error_message = error_message
# return
#
# try:
# self.data_df = [Link](extracted_data)
# self.data_df.insert(0, 'Sr. No.', range(1, len(self.data_df)
+ 1))
# data=self.data_df.to_dict('records')
# self.data_df = [Link](data)
#
# self._process_save_data( self.data_df )
#
# [Link] = True
# self.error_message = None
#
# [Link](f"Excel data extraction completed successfully",
# records_processed=len(self.data_df),
# columns=list(self.data_df.columns),
# url=excel_url)
#
# except Exception as df_error:
# error_message = f"Error creating DataFrame: {str(df_error)}"
# [Link](error_message,
# error_details=str(df_error),
# traceback=traceback.format_exc(),
# url=excel_url)
# [Link] = False
# self.error_message = error_message
#
# except Exception as excel_error:
# error_message = f"Error processing Excel file:
{str(excel_error)}"
# [Link](error_message,
# error_details=str(excel_error),
# traceback=traceback.format_exc(),
# url=excel_url)
# [Link] = False
# self.error_message = error_message
#
# except Exception as e:
# error_message = f"Unexpected critical error in Excel extraction:
{str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# raise
#
# # this function for save excel data which is cleaned
# def _process_save_data(self, data):
# try:
# [Link](f"converting cleaned {len(data)} records to dataframe")
# self.data_df=data
# source_url = '[Link]
of-surrender-members'
# if 'Sr. No.' in self.data_df.columns:
# self.data_df.insert(1, 'Source URL', source_url)
# else:
# self.data_df.insert(0, 'Source URL', source_url)
#
# output_dir = f"output_files/{[Link]}/"
# [Link](output_dir, exist_ok=True)
# timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
# self.output_file_path =
f"{output_dir}/{self.source_name}_{timestamp}.xlsx"
#
# self.data_df.to_excel(self.output_file_path, index=False)
#
# [Link](f"Data successfully saved",
# filerecord_count_path=self.output_file_path,
# record_count=len(self.data_df),
# columns=list(self.data_df.columns),
# timestamp=[Link]().strftime('%Y-%m-%d %H:%M:%S'))
#
# except Exception as e:
# [Link](f"Error saving data to Excel",
# error_details=str(e),
# traceback=traceback.format_exc())
# raise
# # this function store uncleaned data in excel
# def _save_uncleaned_data(self, unclean_df):
# try:
# [Link](f"converting unclean {len(unclean_df)} records to
dataframe")
#
# if 'Sr. No.' in unclean_df.columns:
# unclean_df = unclean_df.drop(columns=['Sr. No.'])
# if 'Date of Approval for Surrender of membership' in
unclean_df.columns:
# unclean_df['Date of Approval for Surrender of membership'] =
pd.to_datetime(
# unclean_df['Date of Approval for Surrender of membership'],
# errors='coerce',
# dayfirst=True
# )
# unclean_df['Date of Approval for Surrender of membership'] =
unclean_df[
# 'Date of Approval for Surrender of membership'
# ].[Link]('%d-%b-%y')
# unclean_df['Date of Approval for Surrender of membership'] =
unclean_df[
# 'Date of Approval for Surrender of membership'
# ].fillna('')
#
# source_url = '[Link]
of-surrender-members'
# unclean_df.insert(0, 'Sr. No.', range(1, len(unclean_df) + 1))
#
# input_dir = f"input_files/{[Link]}/"
# [Link](input_dir, exist_ok=True)
# timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
# output_path = f"{input_dir}/{self.source_name}_{timestamp}.xlsx"
#
# unclean_df.to_excel(output_path, index=False, engine='openpyxl')
#
# [Link](f"Uncleaned data saved as Excel with exact date format
(dd-mmm-yy)",
# file_path=output_path,
# record_count=len(unclean_df),
# columns=list(unclean_df.columns))
#
# except Exception as e:
# [Link](f"Error saving uncleaned data",
# error_details=str(e),
# traceback=traceback.format_exc())
# raise
#
# def closed(self, reason):
# row_count = 0
# if hasattr(self, 'data_df') and isinstance(self.data_df, [Link])
and not self.data_df.empty:
# row_count = len(self.data_df)
#
# if hasattr(self, 'success') and [Link]:
# [Link](f"Spider closed successfully, updating status with
{row_count} records",
# source_id=self.source_id,
# source_description=self.source_description)
# update_scraping_status_on_success(self.source_id, row_count)
#
# if hasattr(self, 'output_file_path') and self.output_file_path:
# [Link](f"Uploading results to S3",
# file_path=self.output_file_path,
# source_id=self.source_id,
# source_description=self.source_description)
# upload_to_s3_and_update_db(self.output_file_path,
self.source_dict, logger)
# else:
# [Link]("No output file path available, skipping S3
upload",
# source_id=self.source_id,
# source_description=self.source_description)
# else:
# error_msg = self.error_message if hasattr(self,
# 'error_message') and
self.error_message else f"Spider closed with reason: {reason}"
# [Link](f"Spider closed with errors, updating failure status:
{error_msg}",
# source_id=self.source_id,
# source_description=self.source_description if
hasattr(self,
#
'source_description') else "unknown")
# update_scraping_status_on_failure(self.source_id, error_msg,
row_count)
#
#
# def run(source_dict=None):
# try:
# source_id = source_dict.get("source_id")
# source_description = source_dict.get("source_description", "List of
Surrendered/Ceased Members")
#
# [Link]("Starting spider execution",
# source_id=source_id,
# source_description=source_description)
#
# try:
# from [Link] import CrawlerProcess
# process = CrawlerProcess(settings)
# [Link](McxSpiderDataSpider, source_dict=source_dict)
# [Link]()
# except Exception as crawler_error:
# [Link](f"CrawlerProcess approach failed:
{str(crawler_error)}. Trying fallback method.",
# error_details=str(crawler_error))
#
# spider = McxSpiderDataSpider(source_dict=source_dict)
# try:
# requests = list(spider.start_requests() or [])
# if requests:
# for request in requests:
# response = None
# if 'response_object' in [Link]:
# response = [Link]['response_object']
# list([Link](response) or [])
# else:
# [Link]("No requests generated from start_requests")
# [Link](reason="finished")
# except Exception as manual_error:
# error_message = f"Error in manual spider execution:
{str(manual_error)}"
# [Link](error_message,
# error_details=str(manual_error),
# traceback=traceback.format_exc())
# raise
#
# [Link]("Spider execution completed successfully")
# return True
# except Exception as e:
# error_message = f"Critical error running spider: {str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc(),
# source_id=source_id,
# source_description=source_description)
# update_scraping_status_on_failure(source_id, error_message, 0)
# return False
#
#
# if __name__ == '__main__':
# hardcoded_source_dict = {
# "source_id": "mcxindia_spider",
# "id": 77,
# "source_name": "www_mcxindia_com",
# "source_description": "List of Surrendered/Ceased Members",
# "count": 0
# }
# run(hardcoded_source_dict)
import scrapy
import pandas as pd
import re
import os
import traceback
import warnings
import pycountry
from io import BytesIO
from datetime import datetime
from [Link] import urljoin
from [Link] import TextResponse
from src.helpers_scrapy.scrap_logger import ScrapperLogger
from src.helpers_scrapy.global_helpers import update_scraping_status_on_success,
update_scraping_status_on_failure, \
upload_to_s3_and_update_db
from src.helpers_scrapy.proxy_retry_helper import ProxyRetryHelper
[Link]("ignore", category=UserWarning, module="openpyxl")
settings = {
'BOT_NAME': 'watchlist_scrapper',
'ROBOTSTXT_OBEY': False,
'CONCURRENT_REQUESTS': 1,
'DOWNLOAD_DELAY': 3,
'COOKIES_ENABLED': True,
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}
# this library called pycountry provide country names so use it for get country so
same as and also save () sign
VALID_COUNTRIES = {[Link]() for c in [Link]}.union(
{c.alpha_2.lower() for c in [Link]}
)
def clean_text(text):
text_str = str(text)
print(text)
store=text_str.split(' ')
print(store)
text_str = [Link](r'\((?!.*?\)).*', lambda m: [Link](0) + ')', text_str)
text_processed = [Link](
r'\(([^)]*)\)',
lambda m: f'({[Link](1)})' if [Link](1).strip().lower() in
VALID_COUNTRIES else [Link](1),
text_str
)
text_with_spaces = [Link](r'[^\w\s/\'\-&()]', ' ', text_processed)
cleaned_data = [Link](r'\s+', ' ', text_with_spaces).strip()
return cleaned_data if cleaned_data else ""
class DataSpiderSpider([Link]):
name = "mcxindia_spider"
source_name = "www_mcxindia_com"
source_id = None
# headers
[Link] = {
'Accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/
webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/[Link] Safari/537.36'
}
[Link] = {
'_gid': 'GA1.2.1382501654.1744804578',
'_ga_8BQ43G0902': 'GS1.1.1744804578.1.0.1744804578.0.0.0',
'_ga': 'GA1.1.1289432087.1744804578',
'ASP.NET_SessionId': 'lexvhplwoxq2z1tjh5nukaj2',
}
except Exception as e:
[Link]("CRITICAL ERROR during initialization",
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = f"Initialization error: {str(e)}"
raise
# this function fetch starting page response and send to parse
def start_requests(self):
try:
req_url = '[Link]
surrender-members'
[Link](f"Starting data extraction process",
url=req_url,
source=self.source_name,
source_description=self.source_description)
try:
html_content = self.proxy_helper.fetch_with_retries(req_url)
except Exception as proxy_error:
error_message = f"Error during proxy fetch: {str(proxy_error)}"
[Link](error_message,
error_details=str(proxy_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return
if html_content:
# Create a TextResponse object to parse the HTML
response = TextResponse(url=req_url, body=html_content,
encoding='utf-8')
[Link](f"Successfully retrieved content, proceeding to parse",
content_length=len(html_content))
except Exception as e:
error_message = f"Critical error in start_requests: {str(e)}"
[Link](error_message,
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return []
# this function is fetching excel xpath and send response in extract row data
function
def parse(self, response, **kwargs):
try:
[Link](f"Starting parsing process", url=[Link])
if 'response_object' in [Link]:
response = [Link]['response_object']
[Link]("Using pre-fetched response from meta")
try:
relative_url = [Link](
'//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
).get()
try:
[Link]("Attempting retry to fetch content",
url=[Link])
html_content =
self.proxy_helper.fetch_with_retries([Link])
if html_content:
response = TextResponse(url=[Link],
body=html_content, encoding='utf-8')
relative_url = [Link](
'//div[@class="clearfix mrT20"]//div[@class="main-
pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
).get()
if not relative_url:
error_message = "Failed to find Excel file link
even after retry"
[Link](error_message,
xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')
[Link] = False
self.error_message = error_message
return
else:
[Link]("Successfully retrieved Excel link
after retry")
else:
error_message = f"All proxy attempts failed for
{[Link]}"
[Link](error_message, url=[Link])
[Link] = False
self.error_message = error_message
return
except Exception as retry_error:
error_message = f"Error during retry attempt:
{str(retry_error)}"
[Link](error_message,
error_details=str(retry_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return
else:
[Link]("Found Excel file link in initial parse")
try:
# Create a new request for inner response so need to do
change here for loop of table to scrapy request
yield [Link](
url=full_url,
headers=headers,
cookies=cookies,
callback=self._extract_row_data,
meta={
'original_headers': headers,
'original_cookies':cookies}
)
except Exception as request_error:
error_message = f"Error creating Excel download request:
{str(request_error)}"
[Link](error_message,
error_details=str(request_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return
except Exception as e:
error_message = f"Unexpected critical error in parse method: {str(e)}"
[Link](error_message,
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
extracted_data = []
unclean_records = []
try:
xls_file = BytesIO([Link])
xls = [Link](xls_file, engine='openpyxl')
df['status'] = sheet_name
if unclean_records:
unclean_df = [Link](unclean_records)
self._save_uncleaned_data(unclean_df)
if not extracted_data:
error_message = "No data extracted from Excel file"
[Link](error_message, url=excel_url)
[Link] = False
self.error_message = error_message
return
try:
self.data_df = [Link](extracted_data)
self.data_df.insert(0, 'Sr. No.', range(1, len(self.data_df) +
1))
data = self.data_df.to_dict('records')
self.data_df = [Link](data)
self._process_save_data(self.data_df)
[Link] = True
self.error_message = None
except Exception as e:
error_message = f"Unexpected critical error in Excel extraction:
{str(e)}"
[Link](error_message,
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
raise
def is_as_is_content(self,content):
PROCESS_PHRASES = [
r'Formerly Known as',
r'Formarely Known as',
r'Currently known as',
r'A Division of',
r'Division of'
]
"""
Returns True if content should be kept as-is (doesn't start with any
PROCESS_PHRASES)
"""
content_lower = [Link]().strip()
return not any(
[Link]([Link]([Link]()), content_lower)
for phrase in PROCESS_PHRASES
)
alias_phrases = [
r'Formerly Known as (.*?)(?=\s*$|\s*\(|$)',
r'Formarely Known as (.*?)(?=\s*$|\s*\(|$)',
r'Formarely Known as (.*?)(?=\s*$|\s*\(|$)',
r'Currently known as (.*?)(?=\s*$|\s*\(|$)',
]
additional_phrases = [
r'A Division of (.*?)(?=\s*$|\s*\(|$)',
r'Division of (.*?)(?=\s*$|\s*\(|$)',
r'\(A Division of ([^\(\)]*)'
additional_info = ''
alias = ''
output_dir = f"output_files/{[Link]}/"
[Link](output_dir, exist_ok=True)
timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
self.output_file_path =
f"{output_dir}/{self.source_name}_{timestamp}.xlsx"
self.data_df.to_excel(self.output_file_path, index=False)
except Exception as e:
[Link](f"Error saving data to Excel",
error_details=str(e),
traceback=traceback.format_exc())
raise
source_url = '[Link]
surrender-members'
unclean_df.insert(0, 'Sr. No.', range(1, len(unclean_df) + 1))
input_dir = f"input_files/{[Link]}/"
[Link](input_dir, exist_ok=True)
timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
output_path = f"{input_dir}/{self.source_name}_{timestamp}.xlsx"
except Exception as e:
[Link](f"Error saving uncleaned data",
error_details=str(e),
traceback=traceback.format_exc())
raise
def run(source_dict=None):
"""
Run the spider with the given source dictionary.
This function can be called from other scripts.
Args:
source_dict: Dictionary containing source information
Returns:
True if the spider ran successfully, False otherwise
"""
try:
# Extract source ID from the source_dict
source_id = source_dict.get("source_id")
source_description = source_dict.get("source_description", "BIS Doc
Government Source")
try:
# Try the CrawlerProcess approach first
from [Link] import CrawlerProcess
# Manual status update in case the spider's closed method wasn't called
update_scraping_status_on_failure(source_id, error_message, 0)
return False
if __name__ == '__main__':
hardcoded_source_dict = {
"source_id": "mcxindia_spider",
"id": 77,
"source_name": "www_mcxindia_com",
"source_description": "List of Surrendered/Ceased Members",
"count": 0
}
run(hardcoded_source_dict)