0% found this document useful (0 votes)
17 views24 pages

MCX India Spider Data Extraction Script

The document outlines a Scrapy spider implementation for extracting data from the MCX India website, specifically targeting a list of surrendered or ceased members. It includes initialization, request handling, and data extraction from Excel files, with error handling and logging mechanisms throughout the process. The spider uses a proxy retry helper to manage connection issues and processes the data into a structured format for further use.

Uploaded by

dhruvchavda447
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
17 views24 pages

MCX India Spider Data Extraction Script

The document outlines a Scrapy spider implementation for extracting data from the MCX India website, specifically targeting a list of surrendered or ceased members. It includes initialization, request handling, and data extraction from Excel files, with error handling and logging mechanisms throughout the process. The spider uses a proxy retry helper to manage connection issues and processes the data into a structured format for further use.

Uploaded by

dhruvchavda447
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd

# import scrapy

# import pandas as pd
# import re
# import os
# import traceback
# import warnings
# from io import BytesIO
# from datetime import datetime
# from [Link] import urljoin
#
# from [Link] import TextResponse
#
# from src.helpers_scrapy.scrap_logger import ScrapperLogger
# from src.helpers_scrapy.global_helpers import update_scraping_status_on_success,
update_scraping_status_on_failure, \
# upload_to_s3_and_update_db
# from src.helpers_scrapy.proxy_retry_helper import ProxyRetryHelper
#
# [Link]("ignore", category=UserWarning, module="openpyxl")
#
# logger = ScrapperLogger(script_name="mcxindia_spider", source="www_mcxindia_com")
#
# settings = {
# 'BOT_NAME': 'w1atchlist_scrapper',
# 'ROBOTSTXT_OBEY': False,
# 'CONCURRENT_REQUESTS': 1,
# 'DOWNLOAD_DELAY': 3,
# 'COOKIES_ENABLED': True,
# 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
# }
#
#
# def clean_text(text):
# text_with_spaces = [Link](r'[^\w\s/\'\-&]', ' ', str(text))
# cleaned_data = [Link](r'\s+', ' ', text_with_spaces).strip()
# return cleaned_data if cleaned_data else ""
# class McxSpiderDataSpider([Link]):
# name = "mcxindia_spider"
# source_name = "www_mcxindia_com"
# source_id = None
#
# def __init__(self, source_dict=None, *args, **kwargs):
# try:
# super(McxSpiderDataSpider, self).__init__(*args, **kwargs)
# if source_dict and "source_id" in source_dict:
# self.source_id = source_dict.get("source_id")
# self.source_description = source_dict.get("source_description",
# "List of Surrendered/Ceased
Members") if source_dict else "List of Surrendered/Ceased Members"
# [Link](f"Initializing {[Link]} spider for
{self.source_name}",
# source_description=self.source_description)
# self.proxy_helper = ProxyRetryHelper(logger)
# self.data_df = None
# self.output_file_path = None
# self.source_dict = source_dict
# self.error_message = None
# [Link] = True
#
# # headers
# [Link] = {
# 'Accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/
webp,image/apng,*/*;q=0.8',
# 'Accept-Language': 'en-US,en;q=0.9',
# 'Connection': 'keep-alive',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/[Link] Safari/537.36'
# }
# [Link] = {
# '_gid': 'GA1.2.1382501654.1744804578',
# '_ga_8BQ43G0902': 'GS1.1.1744804578.1.0.1744804578.0.0.0',
# '_ga': 'GA1.1.1289432087.1744804578',
# 'ASP.NET_SessionId': 'lexvhplwoxq2z1tjh5nukaj2',
# }
# except Exception as e:
# [Link]("CRITICAL ERROR during initialization",
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = f"Initialization error: {str(e)}"
# raise
# # this function fetch starting page response and send to parse
# def start_requests(self):
# try:
# req_url = '[Link]
surrender-members'
# [Link](f"Starting data extraction process",
# url=req_url,
# source=self.source_name,
# source_description=self.source_description)
#
# if not hasattr(self, 'proxy_helper') or self.proxy_helper is None:
# error_message = "proxy_helper is not initialized properly"
# [Link](error_message)
# [Link] = False
# self.error_message = error_message
# return
#
# [Link](f"Fetching initial content with proxy retry helper",
url=req_url)
#
# try:
# html_content = self.proxy_helper.fetch_with_retries(req_url)
# except Exception as proxy_error:
# error_message = f"Error during proxy fetch: {str(proxy_error)}"
# [Link](error_message,
# error_details=str(proxy_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return
#
# if html_content:
# # Create a TextResponse object to parse the HTML
# response = TextResponse(url=req_url, body=html_content,
encoding='utf-8')
# [Link](f"Successfully retrieved content, proceeding to
parse",
# content_length=len(html_content))
#
# # Create a new request with proper headers
# yield [Link](
# url=req_url,
# callback=[Link],
# dont_filter=True,
# meta={
# 'response_object': response,
# 'original_headers': [Link],
# 'original_cookies':[Link]
# },
# headers=[Link],
# cookies=[Link])
# else:
# error_message = f"Failed to retrieve data from {req_url} after
all retry attempts"
# [Link](error_message, url=req_url)
# [Link] = False
# self.error_message = error_message
# return []
#
# except Exception as e:
# error_message = f"Critical error in start_requests: {str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return []
#
# # this function is fetching excel xpath and send response in extract row data
function
# def parse(self, response, **kwargs):
# try:
# [Link](f"Starting parsing process", url=[Link])
#
# # Get headers from meta
# headers = [Link]('original_headers', {})
# cookies = [Link]('original_cookies', {})
#
# if 'response_object' in [Link]:
# response = [Link]['response_object']
# [Link]("Using pre-fetched response from meta")
#
# try:
# relative_url = [Link](
# '//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
# ).get()
#
# # Check if we found the expected data structure
# if not relative_url:
# [Link]("Excel file link not found in initial parse
attempt",
# xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')
#
# try:
# [Link]("Attempting retry to fetch content",
url=[Link])
# html_content =
self.proxy_helper.fetch_with_retries([Link])
#
# if html_content:
# response = TextResponse(url=[Link],
body=html_content, encoding='utf-8')
# relative_url = [Link](
# '//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
# ).get()
#
# if not relative_url:
# error_message = "Failed to find Excel file link
even after retry"
# [Link](error_message,
# xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')
# [Link] = False
# self.error_message = error_message
# return
# else:
# [Link]("Successfully retrieved Excel link
after retry")
# else:
# error_message = f"All proxy attempts failed for
{[Link]}"
# [Link](error_message, url=[Link])
# [Link] = False
# self.error_message = error_message
# return
# except Exception as retry_error:
# error_message = f"Error during retry attempt:
{str(retry_error)}"
# [Link](error_message,
# error_details=str(retry_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return
# else:
# [Link]("Found Excel file link in initial parse")
#
# # Process the Excel file link
# if relative_url:
# full_url = urljoin([Link], relative_url)
# [Link](f"Found Excel file link, proceeding to download",
# excel_url=full_url)
#
# try:
# # Create a new request with the original headers
# yield [Link](
# url=full_url,
# headers=headers,
# cookies=cookies,
# callback=self._extract_row_data,
# meta={
# 'original_headers': headers,
# 'original_cookies':cookies}
# )
# except Exception as request_error:
# error_message = f"Error creating Excel download request:
{str(request_error)}"
# [Link](error_message,
# error_details=str(request_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return
#
# except Exception as parse_error:
# error_message = f"Error while parsing HTML content:
{str(parse_error)}"
# [Link](error_message,
# error_details=str(parse_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
#
# except Exception as e:
# error_message = f"Unexpected critical error in parse method:
{str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# # this function is used for fetch excel inner row data properly and manage
some columns
# def _extract_row_data(self, response):
# try:
# excel_url = getattr(response, 'url', 'URL_NOT_AVAILABLE')
# [Link]("Starting Excel data extraction process",
# url=excel_url,
# source=self.source_name,
# source_description=self.source_description)
#
# extracted_data = []
# unclean_records = []
#
# try:
# xls_file = BytesIO([Link])
# xls = [Link](xls_file, engine='openpyxl')
# # because we have 2 sheets so we need to do like this for get
data one by one
# for sheet_name in xls.sheet_names:
# try:
# [Link](f"Processing sheet: {sheet_name}")
# df = [Link](sheet_name)
# [Link] = [Link]()
# raw_df = [Link]()
# raw_df['status'] = sheet_name
# unclean_records.extend(raw_df.to_dict('records'))
# required_cols = ['Name of the Member', 'Member ID',
# 'Date of Approval for Surrender of
membership',
# 'SEBI Reg. No']
# for col in required_cols:
# if col not in [Link]:
# df[col] = ''
# if 'Member ID' in [Link]:
# df['Member ID'] = df['Member
ID'].astype(str).[Link]()
# # need to do date format like yyyy/mm/dd format
# if 'Date of Approval for Surrender of membership' in
[Link]:
# df['Date of Approval for Surrender of membership'] =
pd.to_datetime(
# df['Date of Approval for Surrender of
membership'], errors='coerce'
# ).[Link]('%Y/%m/%d')
# df['Date of Approval for Surrender of membership'] =
df[
# 'Date of Approval for Surrender of
membership'].fillna('')
#
# df['status'] = sheet_name
#
# for _, row in [Link]():
# try:
# record = {
# 'Name of the Member': '',
# 'Member ID': '',
# 'SEBI Reg. No': '',
# 'Date of Approval for Surrender of
membership': '',
# 'status': sheet_name,
# 'Alias': '',
# 'Proprietor': ''
# }
#
# for field in ['Name of the Member', 'Member ID',
# 'Date of Approval for Surrender of
membership',
# 'SEBI Reg. No']:
# if field in row:
# record[field] = str(row[field]).strip()
if [Link](row[field]) else ''
#
# name = record['Name of the Member']
# # need to check proprietor if getting
# if 'Proprietor' in name:
# proprietor_match = [Link](r'Proprietor
(.*?)(?=\s*$|\s*\(|$)', name)
# if proprietor_match:
# record['Proprietor']
=clean_text(proprietor_match.group(1))
# name = [Link](f"Proprietor
{proprietor_match.group(1)}", "").strip()
# # in alias we getting like this data so need to
remove this starting
# alias_phrases = [
# r'Formerly Known as (.*?)(?=\s*$|\s*\(|$)',
# r'Currently known as (.*?)(?=\s*$|\s*\(|$)',
# r'A Division of (.*?)(?=\s*$|\s*\(|$)',
# r'Division of (.*?)(?=\s*$|\s*\(|$)',
# r'Commodity trading division - (.*?)(?=\s*$|\
s*\(|$)',
# r'\((.*?)\)'
# ]
#
# alias = None
# for phrase in alias_phrases:
# match = [Link](phrase, name,
[Link])
# if match:
# alias =clean_text([Link](1))
# name = [Link]([Link](0),
"").strip()
# break
# # if found this word then need to ignore this
because this is not Alias here
# if alias:
# ignore_list = ['india', 'plastic', 'p',
'gujarat', 'pvt.', 'pvt', 'i', 'l', 'guj',
# 'Import & Export', 'Agencies',
'HUF']
# if [Link]() not in [[Link]() for x in
ignore_list]:
# record['Alias'] = alias
#
# record['Name of the Member'] =clean_text(name)
# extracted_data.append(record)
#
# except Exception as row_error:
# [Link](f"Error processing row in sheet
{sheet_name}",
# error_details=str(row_error))
# continue
#
# except Exception as sheet_error:
# [Link](f"Error processing sheet {sheet_name}",
# error_details=str(sheet_error),
# traceback=traceback.format_exc())
# continue
#
# if unclean_records:
# unclean_df = [Link](unclean_records)
# self._save_uncleaned_data(unclean_df)
# if not extracted_data:
# error_message = "No data extracted from Excel file"
# [Link](error_message, url=excel_url)
# [Link] = False
# self.error_message = error_message
# return
#
# try:
# self.data_df = [Link](extracted_data)
# self.data_df.insert(0, 'Sr. No.', range(1, len(self.data_df)
+ 1))
# data=self.data_df.to_dict('records')
# self.data_df = [Link](data)
#
# self._process_save_data( self.data_df )
#
# [Link] = True
# self.error_message = None
#
# [Link](f"Excel data extraction completed successfully",
# records_processed=len(self.data_df),
# columns=list(self.data_df.columns),
# url=excel_url)
#
# except Exception as df_error:
# error_message = f"Error creating DataFrame: {str(df_error)}"
# [Link](error_message,
# error_details=str(df_error),
# traceback=traceback.format_exc(),
# url=excel_url)
# [Link] = False
# self.error_message = error_message
#
# except Exception as excel_error:
# error_message = f"Error processing Excel file:
{str(excel_error)}"
# [Link](error_message,
# error_details=str(excel_error),
# traceback=traceback.format_exc(),
# url=excel_url)
# [Link] = False
# self.error_message = error_message
#
# except Exception as e:
# error_message = f"Unexpected critical error in Excel extraction:
{str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# raise
#
# # this function for save excel data which is cleaned
# def _process_save_data(self, data):
# try:
# [Link](f"converting cleaned {len(data)} records to dataframe")
# self.data_df=data
# source_url = '[Link]
of-surrender-members'
# if 'Sr. No.' in self.data_df.columns:
# self.data_df.insert(1, 'Source URL', source_url)
# else:
# self.data_df.insert(0, 'Source URL', source_url)
#
# output_dir = f"output_files/{[Link]}/"
# [Link](output_dir, exist_ok=True)
# timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
# self.output_file_path =
f"{output_dir}/{self.source_name}_{timestamp}.xlsx"
#
# self.data_df.to_excel(self.output_file_path, index=False)
#
# [Link](f"Data successfully saved",
# filerecord_count_path=self.output_file_path,
# record_count=len(self.data_df),
# columns=list(self.data_df.columns),
# timestamp=[Link]().strftime('%Y-%m-%d %H:%M:%S'))
#
# except Exception as e:
# [Link](f"Error saving data to Excel",
# error_details=str(e),
# traceback=traceback.format_exc())
# raise
# # this function store uncleaned data in excel
# def _save_uncleaned_data(self, unclean_df):
# try:
# [Link](f"converting unclean {len(unclean_df)} records to
dataframe")
#
# if 'Sr. No.' in unclean_df.columns:
# unclean_df = unclean_df.drop(columns=['Sr. No.'])
# if 'Date of Approval for Surrender of membership' in
unclean_df.columns:
# unclean_df['Date of Approval for Surrender of membership'] =
pd.to_datetime(
# unclean_df['Date of Approval for Surrender of membership'],
# errors='coerce',
# dayfirst=True
# )
# unclean_df['Date of Approval for Surrender of membership'] =
unclean_df[
# 'Date of Approval for Surrender of membership'
# ].[Link]('%d-%b-%y')
# unclean_df['Date of Approval for Surrender of membership'] =
unclean_df[
# 'Date of Approval for Surrender of membership'
# ].fillna('')
#
# source_url = '[Link]
of-surrender-members'
# unclean_df.insert(0, 'Sr. No.', range(1, len(unclean_df) + 1))
#
# input_dir = f"input_files/{[Link]}/"
# [Link](input_dir, exist_ok=True)
# timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
# output_path = f"{input_dir}/{self.source_name}_{timestamp}.xlsx"
#
# unclean_df.to_excel(output_path, index=False, engine='openpyxl')
#
# [Link](f"Uncleaned data saved as Excel with exact date format
(dd-mmm-yy)",
# file_path=output_path,
# record_count=len(unclean_df),
# columns=list(unclean_df.columns))
#
# except Exception as e:
# [Link](f"Error saving uncleaned data",
# error_details=str(e),
# traceback=traceback.format_exc())
# raise
#
# def closed(self, reason):
# row_count = 0
# if hasattr(self, 'data_df') and isinstance(self.data_df, [Link])
and not self.data_df.empty:
# row_count = len(self.data_df)
#
# if hasattr(self, 'success') and [Link]:
# [Link](f"Spider closed successfully, updating status with
{row_count} records",
# source_id=self.source_id,
# source_description=self.source_description)
# update_scraping_status_on_success(self.source_id, row_count)
#
# if hasattr(self, 'output_file_path') and self.output_file_path:
# [Link](f"Uploading results to S3",
# file_path=self.output_file_path,
# source_id=self.source_id,
# source_description=self.source_description)
# upload_to_s3_and_update_db(self.output_file_path,
self.source_dict, logger)
# else:
# [Link]("No output file path available, skipping S3
upload",
# source_id=self.source_id,
# source_description=self.source_description)
# else:
# error_msg = self.error_message if hasattr(self,
# 'error_message') and
self.error_message else f"Spider closed with reason: {reason}"
# [Link](f"Spider closed with errors, updating failure status:
{error_msg}",
# source_id=self.source_id,
# source_description=self.source_description if
hasattr(self,
#
'source_description') else "unknown")
# update_scraping_status_on_failure(self.source_id, error_msg,
row_count)
#
#
# def run(source_dict=None):
# try:
# source_id = source_dict.get("source_id")
# source_description = source_dict.get("source_description", "List of
Surrendered/Ceased Members")
#
# [Link]("Starting spider execution",
# source_id=source_id,
# source_description=source_description)
#
# try:
# from [Link] import CrawlerProcess
# process = CrawlerProcess(settings)
# [Link](McxSpiderDataSpider, source_dict=source_dict)
# [Link]()
# except Exception as crawler_error:
# [Link](f"CrawlerProcess approach failed:
{str(crawler_error)}. Trying fallback method.",
# error_details=str(crawler_error))
#
# spider = McxSpiderDataSpider(source_dict=source_dict)
# try:
# requests = list(spider.start_requests() or [])
# if requests:
# for request in requests:
# response = None
# if 'response_object' in [Link]:
# response = [Link]['response_object']
# list([Link](response) or [])
# else:
# [Link]("No requests generated from start_requests")
# [Link](reason="finished")
# except Exception as manual_error:
# error_message = f"Error in manual spider execution:
{str(manual_error)}"
# [Link](error_message,
# error_details=str(manual_error),
# traceback=traceback.format_exc())
# raise
#
# [Link]("Spider execution completed successfully")
# return True
# except Exception as e:
# error_message = f"Critical error running spider: {str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc(),
# source_id=source_id,
# source_description=source_description)
# update_scraping_status_on_failure(source_id, error_message, 0)
# return False
#
#
# if __name__ == '__main__':
# hardcoded_source_dict = {
# "source_id": "mcxindia_spider",
# "id": 77,
# "source_name": "www_mcxindia_com",
# "source_description": "List of Surrendered/Ceased Members",
# "count": 0
# }
# run(hardcoded_source_dict)
import scrapy
import pandas as pd
import re
import os
import traceback
import warnings
import pycountry
from io import BytesIO
from datetime import datetime
from [Link] import urljoin
from [Link] import TextResponse
from src.helpers_scrapy.scrap_logger import ScrapperLogger
from src.helpers_scrapy.global_helpers import update_scraping_status_on_success,
update_scraping_status_on_failure, \
upload_to_s3_and_update_db
from src.helpers_scrapy.proxy_retry_helper import ProxyRetryHelper
[Link]("ignore", category=UserWarning, module="openpyxl")

logger = ScrapperLogger(script_name="mcxindia_spider", source="www_mcxindia_com")

settings = {
'BOT_NAME': 'watchlist_scrapper',
'ROBOTSTXT_OBEY': False,
'CONCURRENT_REQUESTS': 1,
'DOWNLOAD_DELAY': 3,
'COOKIES_ENABLED': True,
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# this library called pycountry provide country names so use it for get country so
same as and also save () sign
VALID_COUNTRIES = {[Link]() for c in [Link]}.union(
{c.alpha_2.lower() for c in [Link]}
)

def clean_text(text):
text_str = str(text)
print(text)
store=text_str.split(' ')
print(store)
text_str = [Link](r'\((?!.*?\)).*', lambda m: [Link](0) + ')', text_str)

text_processed = [Link](
r'\(([^)]*)\)',
lambda m: f'({[Link](1)})' if [Link](1).strip().lower() in
VALID_COUNTRIES else [Link](1),
text_str
)
text_with_spaces = [Link](r'[^\w\s/\'\-&()]', ' ', text_processed)
cleaned_data = [Link](r'\s+', ' ', text_with_spaces).strip()
return cleaned_data if cleaned_data else ""

class DataSpiderSpider([Link]):
name = "mcxindia_spider"
source_name = "www_mcxindia_com"
source_id = None

def __init__(self, source_dict=None, *args, **kwargs):


try:
super(DataSpiderSpider, self).__init__(*args, **kwargs)
if source_dict and "source_id" in source_dict:
self.source_id = source_dict.get("source_id")
self.source_description = source_dict.get("source_description",
"List of Surrendered/Ceased
Members") if source_dict else "List of Surrendered/Ceased Members"
[Link](f"Initializing {[Link]} spider for {self.source_name}",
source_description=self.source_description)
self.proxy_helper = ProxyRetryHelper(logger)
self.data_df = None
self.output_file_path = None
self.source_dict = source_dict
self.error_message = None
[Link] = True

# headers
[Link] = {
'Accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/
webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/[Link] Safari/537.36'
}
[Link] = {
'_gid': 'GA1.2.1382501654.1744804578',
'_ga_8BQ43G0902': 'GS1.1.1744804578.1.0.1744804578.0.0.0',
'_ga': 'GA1.1.1289432087.1744804578',
'ASP.NET_SessionId': 'lexvhplwoxq2z1tjh5nukaj2',
}
except Exception as e:
[Link]("CRITICAL ERROR during initialization",
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = f"Initialization error: {str(e)}"
raise
# this function fetch starting page response and send to parse
def start_requests(self):
try:
req_url = '[Link]
surrender-members'
[Link](f"Starting data extraction process",
url=req_url,
source=self.source_name,
source_description=self.source_description)

if not hasattr(self, 'proxy_helper') or self.proxy_helper is None:


error_message = "proxy_helper is not initialized properly"
[Link](error_message)
[Link] = False
self.error_message = error_message
return

[Link](f"Fetching initial content with proxy retry helper",


url=req_url)

try:
html_content = self.proxy_helper.fetch_with_retries(req_url)
except Exception as proxy_error:
error_message = f"Error during proxy fetch: {str(proxy_error)}"
[Link](error_message,
error_details=str(proxy_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return

if html_content:
# Create a TextResponse object to parse the HTML
response = TextResponse(url=req_url, body=html_content,
encoding='utf-8')
[Link](f"Successfully retrieved content, proceeding to parse",
content_length=len(html_content))

# Create a new request with proper headers


yield [Link](
url=req_url,
callback=[Link],
dont_filter=True,
meta={
'response_object': response,
'original_headers': [Link],
'original_cookies':[Link]
},
headers=[Link],
cookies=[Link])
else:
error_message = f"Failed to retrieve data from {req_url} after all
retry attempts"
[Link](error_message, url=req_url)
[Link] = False
self.error_message = error_message
return []

except Exception as e:
error_message = f"Critical error in start_requests: {str(e)}"
[Link](error_message,
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return []

# this function is fetching excel xpath and send response in extract row data
function
def parse(self, response, **kwargs):
try:
[Link](f"Starting parsing process", url=[Link])

# Get headers from meta


headers = [Link]('original_headers', {})
cookies = [Link]('original_cookies', {})

if 'response_object' in [Link]:
response = [Link]['response_object']
[Link]("Using pre-fetched response from meta")

try:
relative_url = [Link](
'//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
).get()

# Check if we found the expected data structure


if not relative_url:
[Link]("Excel file link not found in initial parse
attempt",
xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')

try:
[Link]("Attempting retry to fetch content",
url=[Link])
html_content =
self.proxy_helper.fetch_with_retries([Link])

if html_content:
response = TextResponse(url=[Link],
body=html_content, encoding='utf-8')
relative_url = [Link](
'//div[@class="clearfix mrT20"]//div[@class="main-
pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
).get()

if not relative_url:
error_message = "Failed to find Excel file link
even after retry"
[Link](error_message,
xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')
[Link] = False
self.error_message = error_message
return
else:
[Link]("Successfully retrieved Excel link
after retry")
else:
error_message = f"All proxy attempts failed for
{[Link]}"
[Link](error_message, url=[Link])
[Link] = False
self.error_message = error_message
return
except Exception as retry_error:
error_message = f"Error during retry attempt:
{str(retry_error)}"
[Link](error_message,
error_details=str(retry_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return
else:
[Link]("Found Excel file link in initial parse")

# Process the Excel file link


if relative_url:
full_url = urljoin([Link], relative_url)
[Link](f"Found Excel file link, proceeding to download",
excel_url=full_url)

try:
# Create a new request for inner response so need to do
change here for loop of table to scrapy request
yield [Link](
url=full_url,
headers=headers,
cookies=cookies,
callback=self._extract_row_data,
meta={
'original_headers': headers,
'original_cookies':cookies}
)
except Exception as request_error:
error_message = f"Error creating Excel download request:
{str(request_error)}"
[Link](error_message,
error_details=str(request_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return

except Exception as parse_error:


error_message = f"Error while parsing HTML content:
{str(parse_error)}"
[Link](error_message,
error_details=str(parse_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message

except Exception as e:
error_message = f"Unexpected critical error in parse method: {str(e)}"
[Link](error_message,
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message

# this function is used for fetching inner data of excel


def _extract_row_data(self, response):
try:
excel_url = getattr(response, 'url', 'URL_NOT_AVAILABLE')
[Link]("Starting Excel data extraction process",
url=excel_url,
source=self.source_name,
source_description=self.source_description)

extracted_data = []
unclean_records = []

try:
xls_file = BytesIO([Link])
xls = [Link](xls_file, engine='openpyxl')

for sheet_name in xls.sheet_names:


try:
[Link](f"Processing sheet: {sheet_name}")
df = [Link](sheet_name)
[Link] = [Link]()
raw_df = [Link]()
raw_df['status'] = sheet_name
unclean_records.extend(raw_df.to_dict('records'))

required_cols = ['Name of the Member', 'Member ID',


'Date of Approval for Surrender of
membership',
'SEBI Reg. No']
for col in required_cols:
if col not in [Link]:
df[col] = ''
if 'Member ID' in [Link]:
df['Member ID'] = df['Member
ID'].astype(str).[Link]()
if 'Date of Approval for Surrender of membership' in
[Link]:
df['Date of Approval for Surrender of membership'] =
pd.to_datetime(
df['Date of Approval for Surrender of membership'],
errors='coerce'
).[Link]('%Y-%m-%d')
df['Date of Approval for Surrender of membership'] =
df[
'Date of Approval for Surrender of
membership'].fillna('')

df['status'] = sheet_name

for _, row in [Link]():


try:
record = {
'Name of the Member': '',
'Member ID': '',
'SEBI Reg. No': '',
'Date of Approval for Surrender of membership':
'',
'status': sheet_name,
'Proprietor': '',
'Additional information': '' ,
'Alias':''}

for field in ['Name of the Member', 'Member ID',


'Date of Approval for Surrender of
membership',
'SEBI Reg. No']:
if field in row:
record[field] = str(row[field]).strip() if
[Link](row[field]) else ''

name = record['Name of the Member']

# Handle Proprietor if present


if 'Proprietor' in name:
proprietor_match = [Link](r'Proprietor (.*?)
(?=\s*$|\s*\(|$)', name)
if proprietor_match:
clean_proprietor =
proprietor_match.group(1)
proprietor = clean_proprietor.replace(')',
'')
record['Proprietor'] = proprietor
name = [Link](f"Proprietor
{proprietor_match.group(1)}", "").strip()
# Clean name and extract additional information
name = [Link](r'\((?!.*?\)).*', lambda m:
[Link](0) + ')', name)

cleaned_name, additional_info, alias =


self.clean_member_name_and_info(name)
additional_info = [Link](r'\(\s*\(', '(',
additional_info)
additional_info = [Link](r'\)\s*\)', ')',
additional_info)
if '()' in additional_info:
additional_info=''

record['Name of the Member'] =


clean_text(cleaned_name)
record['Additional information'] = additional_info
record['Alias'] = alias
extracted_data.append(record)

except Exception as row_error:


[Link](f"Error processing row in sheet
{sheet_name}",
error_details=str(row_error))
continue

except Exception as sheet_error:


[Link](f"Error processing sheet {sheet_name}",
error_details=str(sheet_error),
traceback=traceback.format_exc())
continue

if unclean_records:
unclean_df = [Link](unclean_records)
self._save_uncleaned_data(unclean_df)

if not extracted_data:
error_message = "No data extracted from Excel file"
[Link](error_message, url=excel_url)
[Link] = False
self.error_message = error_message
return

try:
self.data_df = [Link](extracted_data)
self.data_df.insert(0, 'Sr. No.', range(1, len(self.data_df) +
1))
data = self.data_df.to_dict('records')
self.data_df = [Link](data)

self._process_save_data(self.data_df)

[Link] = True
self.error_message = None

[Link](f"Excel data extraction completed successfully",


records_processed=len(self.data_df),
columns=list(self.data_df.columns),
url=excel_url)
except Exception as df_error:
error_message = f"Error creating DataFrame: {str(df_error)}"
[Link](error_message,
error_details=str(df_error),
traceback=traceback.format_exc(),
url=excel_url)
[Link] = False
self.error_message = error_message

except Exception as excel_error:


error_message = f"Error processing Excel file: {str(excel_error)}"
[Link](error_message,
error_details=str(excel_error),
traceback=traceback.format_exc(),
url=excel_url)
[Link] = False
self.error_message = error_message

except Exception as e:
error_message = f"Unexpected critical error in Excel extraction:
{str(e)}"
[Link](error_message,
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
raise

def is_as_is_content(self,content):
PROCESS_PHRASES = [
r'Formerly Known as',
r'Formarely Known as',
r'Currently known as',
r'A Division of',
r'Division of'
]
"""
Returns True if content should be kept as-is (doesn't start with any
PROCESS_PHRASES)
"""
content_lower = [Link]().strip()
return not any(
[Link]([Link]([Link]()), content_lower)
for phrase in PROCESS_PHRASES
)

def clean_member_name_and_info(self, name):

alias_phrases = [
r'Formerly Known as (.*?)(?=\s*$|\s*\(|$)',
r'Formarely Known as (.*?)(?=\s*$|\s*\(|$)',
r'Formarely Known as (.*?)(?=\s*$|\s*\(|$)',
r'Currently known as (.*?)(?=\s*$|\s*\(|$)',
]

additional_phrases = [
r'A Division of (.*?)(?=\s*$|\s*\(|$)',
r'Division of (.*?)(?=\s*$|\s*\(|$)',
r'\(A Division of ([^\(\)]*)'

additional_info = ''
alias = ''

if [Link](r'\(.*?Commodity Trading Division.*?\)', name, [Link]):


base_name = [Link]('(')[0].strip()
additional_info = [Link]()
return base_name, additional_info, alias

# Extract all brackets


brackets = [Link](r'\((.*?)\)', name)

# Check for alias patterns first


for b in brackets:
for pattern in alias_phrases:
match = [Link](pattern, b, flags=[Link])
if match:
alias = [Link](0).strip()
name_wo_brackets = [Link](r'\s*\(.*?\)', '', name).strip()
member_name = name_wo_brackets
return member_name, additional_info, alias

# Check for additional info patterns


for b in brackets:
for pattern in additional_phrases:
match = [Link](pattern, b, flags=[Link])
if match:
phrase_content = [Link](1).strip()
name_wo_brackets = [Link](r'\s*\(.*?\)', '', name).strip()
member_name = name_wo_brackets
# Modified to match client's expected format
if "A Division of" in b:
additional_info = f"{member_name} (A Division of
{phrase_content})"
else:
additional_info = f"{member_name} (Division of
{phrase_content})"
return member_name, additional_info, alias

# Handle general brackets not in as_it_is_list


for b in brackets:
if self.is_as_is_content(b): # If content doesn't start with special
phrases
# Keep as-is (original behavior for as_it_is_list)
continue
else:
# Process the content (your existing logic)
name_wo_brackets = [Link](r'\s*\(' + [Link](b) + r'\)', '',
name).strip()
additional_info = f"{name_wo_brackets} ({clean_text(b)})"
return [Link](), additional_info, alias
return [Link](), '', alias

# this function for save excel data which is cleaned


def _process_save_data(self, data):
try:
[Link](f"converting cleaned {len(data)} records to dataframe")
self.data_df=data
source_url = '[Link]
surrender-members'
if 'Sr. No.' in self.data_df.columns:
self.data_df.insert(1, 'Source URL', source_url)
else:
self.data_df.insert(0, 'Source URL', source_url)

output_dir = f"output_files/{[Link]}/"
[Link](output_dir, exist_ok=True)
timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
self.output_file_path =
f"{output_dir}/{self.source_name}_{timestamp}.xlsx"

self.data_df.to_excel(self.output_file_path, index=False)

[Link](f"Data successfully saved",


filerecord_count_path=self.output_file_path,
record_count=len(self.data_df),
columns=list(self.data_df.columns),
timestamp=[Link]().strftime('%Y-%m-%d %H:%M:%S'))

except Exception as e:
[Link](f"Error saving data to Excel",
error_details=str(e),
traceback=traceback.format_exc())
raise

# this function store uncleaned data in excel


def _save_uncleaned_data(self, unclean_df):
try:
[Link](f"converting unclean {len(unclean_df)} records to
dataframe")

if 'Sr. No.' in unclean_df.columns:


unclean_df = unclean_df.drop(columns=['Sr. No.'])
if 'Date of Approval for Surrender of membership' in
unclean_df.columns:
unclean_df['Date of Approval for Surrender of membership'] =
pd.to_datetime(
unclean_df['Date of Approval for Surrender of membership'],
errors='coerce',
dayfirst=True
)
unclean_df['Date of Approval for Surrender of membership'] =
unclean_df[
'Date of Approval for Surrender of membership'
].[Link]('%d-%b-%y')
unclean_df['Date of Approval for Surrender of membership'] =
unclean_df[
'Date of Approval for Surrender of membership'
].fillna('')

source_url = '[Link]
surrender-members'
unclean_df.insert(0, 'Sr. No.', range(1, len(unclean_df) + 1))
input_dir = f"input_files/{[Link]}/"
[Link](input_dir, exist_ok=True)
timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
output_path = f"{input_dir}/{self.source_name}_{timestamp}.xlsx"

unclean_df.to_excel(output_path, index=False, engine='openpyxl')

[Link](f"Uncleaned data saved as Excel with exact date format (dd-


mmm-yy)",
file_path=output_path,
record_count=len(unclean_df),
columns=list(unclean_df.columns))

except Exception as e:
[Link](f"Error saving uncleaned data",
error_details=str(e),
traceback=traceback.format_exc())
raise

def closed(self, reason):


"""
Method called when spider closes, ideal place to update status and upload
to S3
"""
# Get record count
row_count = 0
if hasattr(self, 'data_df') and isinstance(self.data_df, [Link]) and
not self.data_df.empty:
row_count = len(self.data_df)

# Update the status based on success flag


if hasattr(self, 'success') and [Link]:
[Link](f"Spider closed successfully, updating status with
{row_count} records",
source_id=self.source_id,
source_description=self.source_description)

# First update the scraping status


update_scraping_status_on_success(self.source_id, row_count)

# Then upload to S3 if a file was generated


if hasattr(self, 'output_file_path') and self.output_file_path:
[Link](f"Uploading results to S3",
file_path=self.output_file_path,
source_id=self.source_id,
source_description=self.source_description)
upload_to_s3_and_update_db(self.output_file_path, self.source_dict,
logger)
else:
[Link]("No output file path available, skipping S3 upload",
source_id=self.source_id,
source_description=self.source_description)
else:
error_msg = self.error_message if hasattr(self,
'error_message') and
self.error_message else f"Spider closed with reason: {reason}"
[Link](f"Spider closed with errors, updating failure status:
{error_msg}",
source_id=self.source_id,
source_description=self.source_description if
hasattr(self,

'source_description') else "unknown")


# Use json_serialize for source_dict if needed
print(error_msg)
update_scraping_status_on_failure(self.source_id, error_msg, row_count)

def run(source_dict=None):
"""
Run the spider with the given source dictionary.
This function can be called from other scripts.

Args:
source_dict: Dictionary containing source information

Returns:
True if the spider ran successfully, False otherwise
"""

try:
# Extract source ID from the source_dict
source_id = source_dict.get("source_id")
source_description = source_dict.get("source_description", "BIS Doc
Government Source")

# If source_dict contains datetime objects, ensure they're properly


serialized
[Link]("Starting spider execution",
source_id=source_id,
source_description=source_description)

try:
# Try the CrawlerProcess approach first
from [Link] import CrawlerProcess

# Start a process to run the spider


process = CrawlerProcess(settings)

# Add the spider to the process with the source_dict


[Link](DataSpiderSpider, source_dict=source_dict)

# Run the process (which blocks until the crawl is finished)


[Link]()

except Exception as crawler_error:


[Link](f"CrawlerProcess approach failed: {str(crawler_error)}.
Trying fallback method.",
error_details=str(crawler_error))

# Fallback to manual spider execution


spider = DataSpiderSpider(source_dict=source_dict)

# Manually execute the spider's methods


try:
# Get requests from start_requests
requests = list(spider.start_requests() or [])
if requests:
for request in requests:
# Process each request manually
response = None
if 'response_object' in [Link]:
response = [Link]['response_object']

# Call parse method with the response


list([Link](response) or [])
else:
[Link]("No requests generated from start_requests")

# Manually call the closed method


[Link](reason="finished")

except Exception as manual_error:


error_message = f"Error in manual spider execution:
{str(manual_error)}"
[Link](error_message,
error_details=str(manual_error),
traceback=traceback.format_exc())
raise

[Link]("Spider execution completed successfully")


return True
except Exception as e:
error_message = f"Critical error running spider: {str(e)}"
[Link](error_message,
error_details=str(e),
traceback=traceback.format_exc(),
source_id=source_id,
source_description=source_description)

# Manual status update in case the spider's closed method wasn't called
update_scraping_status_on_failure(source_id, error_message, 0)
return False

if __name__ == '__main__':
hardcoded_source_dict = {
"source_id": "mcxindia_spider",
"id": 77,
"source_name": "www_mcxindia_com",
"source_description": "List of Surrendered/Ceased Members",
"count": 0
}
run(hardcoded_source_dict)

You might also like