0% found this document useful (0 votes)

17 views24 pages

MCX India Spider Data Extraction Script

The document outlines a Scrapy spider implementation for extracting data from the MCX India website, specifically targeting a list of surrendered or ceased members. It includes initialization, request handling, and data extraction from Excel files, with error handling and logging mechanisms throughout the process. The spider uses a proxy retry helper to manage connection issues and processes the data into a structured format for further use.

Uploaded by

dhruvchavda447

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

17 views24 pages

MCX India Spider Data Extraction Script

Uploaded by

dhruvchavda447

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

# import scrapy

# import pandas as pd
# import re
# import os
# import traceback
# import warnings
# from io import BytesIO
# from datetime import datetime
# from [Link] import urljoin
#
# from [Link] import TextResponse
#
# from src.helpers_scrapy.scrap_logger import ScrapperLogger
# from src.helpers_scrapy.global_helpers import update_scraping_status_on_success,
update_scraping_status_on_failure, \
# upload_to_s3_and_update_db
# from src.helpers_scrapy.proxy_retry_helper import ProxyRetryHelper
#
# [Link]("ignore", category=UserWarning, module="openpyxl")
#
# logger = ScrapperLogger(script_name="mcxindia_spider", source="www_mcxindia_com")
#
# settings = {
# 'BOT_NAME': 'w1atchlist_scrapper',
# 'ROBOTSTXT_OBEY': False,
# 'CONCURRENT_REQUESTS': 1,
# 'DOWNLOAD_DELAY': 3,
# 'COOKIES_ENABLED': True,
# 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
# }
#
#
# def clean_text(text):
# text_with_spaces = [Link](r'[^\w\s/\'\-&]', ' ', str(text))
# cleaned_data = [Link](r'\s+', ' ', text_with_spaces).strip()
# return cleaned_data if cleaned_data else ""
# class McxSpiderDataSpider([Link]):
# name = "mcxindia_spider"
# source_name = "www_mcxindia_com"
# source_id = None
#
# def __init__(self, source_dict=None, *args, **kwargs):
# try:
# super(McxSpiderDataSpider, self).__init__(*args, **kwargs)
# if source_dict and "source_id" in source_dict:
# self.source_id = source_dict.get("source_id")
# self.source_description = source_dict.get("source_description",
# "List of Surrendered/Ceased
Members") if source_dict else "List of Surrendered/Ceased Members"
# [Link](f"Initializing {[Link]} spider for
{self.source_name}",
# source_description=self.source_description)
# self.proxy_helper = ProxyRetryHelper(logger)
# self.data_df = None
# self.output_file_path = None
# self.source_dict = source_dict
# self.error_message = None
# [Link] = True
#
# # headers
# [Link] = {
# 'Accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/
webp,image/apng,*/*;q=0.8',
# 'Accept-Language': 'en-US,en;q=0.9',
# 'Connection': 'keep-alive',
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/[Link] Safari/537.36'
# }
# [Link] = {
# '_gid': 'GA1.2.1382501654.1744804578',
# '_ga_8BQ43G0902': 'GS1.1.1744804578.1.0.1744804578.0.0.0',
# '_ga': 'GA1.1.1289432087.1744804578',
# 'ASP.NET_SessionId': 'lexvhplwoxq2z1tjh5nukaj2',
# }
# except Exception as e:
# [Link]("CRITICAL ERROR during initialization",
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = f"Initialization error: {str(e)}"
# raise
# # this function fetch starting page response and send to parse
# def start_requests(self):
# try:
# req_url = '[Link]
surrender-members'
# [Link](f"Starting data extraction process",
# url=req_url,
# source=self.source_name,
# source_description=self.source_description)
#
# if not hasattr(self, 'proxy_helper') or self.proxy_helper is None:
# error_message = "proxy_helper is not initialized properly"
# [Link](error_message)
# [Link] = False
# self.error_message = error_message
# return
#
# [Link](f"Fetching initial content with proxy retry helper",
url=req_url)
#
# try:
# html_content = self.proxy_helper.fetch_with_retries(req_url)
# except Exception as proxy_error:
# error_message = f"Error during proxy fetch: {str(proxy_error)}"
# [Link](error_message,
# error_details=str(proxy_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return
#
# if html_content:
# # Create a TextResponse object to parse the HTML
# response = TextResponse(url=req_url, body=html_content,
encoding='utf-8')
# [Link](f"Successfully retrieved content, proceeding to
parse",
# content_length=len(html_content))
#
# # Create a new request with proper headers
# yield [Link](
# url=req_url,
# callback=[Link],
# dont_filter=True,
# meta={
# 'response_object': response,
# 'original_headers': [Link],
# 'original_cookies':[Link]
# },
# headers=[Link],
# cookies=[Link])
# else:
# error_message = f"Failed to retrieve data from {req_url} after
all retry attempts"
# [Link](error_message, url=req_url)
# [Link] = False
# self.error_message = error_message
# return []
#
# except Exception as e:
# error_message = f"Critical error in start_requests: {str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return []
#
# # this function is fetching excel xpath and send response in extract row data
function
# def parse(self, response, **kwargs):
# try:
# [Link](f"Starting parsing process", url=[Link])
#
# # Get headers from meta
# headers = [Link]('original_headers', {})
# cookies = [Link]('original_cookies', {})
#
# if 'response_object' in [Link]:
# response = [Link]['response_object']
# [Link]("Using pre-fetched response from meta")
#
# try:
# relative_url = [Link](
# '//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
# ).get()
#
# # Check if we found the expected data structure
# if not relative_url:
# [Link]("Excel file link not found in initial parse
attempt",
# xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')
#
# try:
# [Link]("Attempting retry to fetch content",
url=[Link])
# html_content =
self.proxy_helper.fetch_with_retries([Link])
#
# if html_content:
# response = TextResponse(url=[Link],
body=html_content, encoding='utf-8')
# relative_url = [Link](
# '//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
# ).get()
#
# if not relative_url:
# error_message = "Failed to find Excel file link
even after retry"
# [Link](error_message,
# xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')
# [Link] = False
# self.error_message = error_message
# return
# else:
# [Link]("Successfully retrieved Excel link
after retry")
# else:
# error_message = f"All proxy attempts failed for
{[Link]}"
# [Link](error_message, url=[Link])
# [Link] = False
# self.error_message = error_message
# return
# except Exception as retry_error:
# error_message = f"Error during retry attempt:
{str(retry_error)}"
# [Link](error_message,
# error_details=str(retry_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return
# else:
# [Link]("Found Excel file link in initial parse")
#
# # Process the Excel file link
# if relative_url:
# full_url = urljoin([Link], relative_url)
# [Link](f"Found Excel file link, proceeding to download",
# excel_url=full_url)
#
# try:
# # Create a new request with the original headers
# yield [Link](
# url=full_url,
# headers=headers,
# cookies=cookies,
# callback=self._extract_row_data,
# meta={
# 'original_headers': headers,
# 'original_cookies':cookies}
# )
# except Exception as request_error:
# error_message = f"Error creating Excel download request:
{str(request_error)}"
# [Link](error_message,
# error_details=str(request_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# return
#
# except Exception as parse_error:
# error_message = f"Error while parsing HTML content:
{str(parse_error)}"
# [Link](error_message,
# error_details=str(parse_error),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
#
# except Exception as e:
# error_message = f"Unexpected critical error in parse method:
{str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# # this function is used for fetch excel inner row data properly and manage
some columns
# def _extract_row_data(self, response):
# try:
# excel_url = getattr(response, 'url', 'URL_NOT_AVAILABLE')
# [Link]("Starting Excel data extraction process",
# url=excel_url,
# source=self.source_name,
# source_description=self.source_description)
#
# extracted_data = []
# unclean_records = []
#
# try:
# xls_file = BytesIO([Link])
# xls = [Link](xls_file, engine='openpyxl')
# # because we have 2 sheets so we need to do like this for get
data one by one
# for sheet_name in xls.sheet_names:
# try:
# [Link](f"Processing sheet: {sheet_name}")
# df = [Link](sheet_name)
# [Link] = [Link]()
# raw_df = [Link]()
# raw_df['status'] = sheet_name
# unclean_records.extend(raw_df.to_dict('records'))
# required_cols = ['Name of the Member', 'Member ID',
# 'Date of Approval for Surrender of
membership',
# 'SEBI Reg. No']
# for col in required_cols:
# if col not in [Link]:
# df[col] = ''
# if 'Member ID' in [Link]:
# df['Member ID'] = df['Member
ID'].astype(str).[Link]()
# # need to do date format like yyyy/mm/dd format
# if 'Date of Approval for Surrender of membership' in
[Link]:
# df['Date of Approval for Surrender of membership'] =
pd.to_datetime(
# df['Date of Approval for Surrender of
membership'], errors='coerce'
# ).[Link]('%Y/%m/%d')
# df['Date of Approval for Surrender of membership'] =
df[
# 'Date of Approval for Surrender of
membership'].fillna('')
#
# df['status'] = sheet_name
#
# for _, row in [Link]():
# try:
# record = {
# 'Name of the Member': '',
# 'Member ID': '',
# 'SEBI Reg. No': '',
# 'Date of Approval for Surrender of
membership': '',
# 'status': sheet_name,
# 'Alias': '',
# 'Proprietor': ''
# }
#
# for field in ['Name of the Member', 'Member ID',
# 'Date of Approval for Surrender of
membership',
# 'SEBI Reg. No']:
# if field in row:
# record[field] = str(row[field]).strip()
if [Link](row[field]) else ''
#
# name = record['Name of the Member']
# # need to check proprietor if getting
# if 'Proprietor' in name:
# proprietor_match = [Link](r'Proprietor
(.*?)(?=\s*$|\s*$|$)', name)
# if proprietor_match:
# record['Proprietor']
=clean_text(proprietor_match.group(1))
# name = [Link](f"Proprietor
{proprietor_match.group(1)}", "").strip()
# # in alias we getting like this data so need to
remove this starting
# alias_phrases = [
# r'Formerly Known as (.*?)(?=\s*$|\s*\(|$)',
# r'Currently known as (.*?)(?=\s*$|\s*\(|$)',
# r'A Division of (.*?)(?=\s*$|\s*\(|$)',
# r'Division of (.*?)(?=\s*$|\s*\(|$)',
# r'Commodity trading division - (.*?)(?=\s*$|\
s*\(|$)',
# r'\((.*?)$'
# ]
#
# alias = None
# for phrase in alias_phrases:
# match = [Link](phrase, name,
[Link])
# if match:
# alias =clean_text([Link](1))
# name = [Link]([Link](0),
"").strip()
# break
# # if found this word then need to ignore this
because this is not Alias here
# if alias:
# ignore_list = ['india', 'plastic', 'p',
'gujarat', 'pvt.', 'pvt', 'i', 'l', 'guj',
# 'Import & Export', 'Agencies',
'HUF']
# if [Link]() not in [[Link]() for x in
ignore_list]:
# record['Alias'] = alias
#
# record['Name of the Member'] =clean_text(name)
# extracted_data.append(record)
#
# except Exception as row_error:
# [Link](f"Error processing row in sheet
{sheet_name}",
# error_details=str(row_error))
# continue
#
# except Exception as sheet_error:
# [Link](f"Error processing sheet {sheet_name}",
# error_details=str(sheet_error),
# traceback=traceback.format_exc())
# continue
#
# if unclean_records:
# unclean_df = [Link](unclean_records)
# self._save_uncleaned_data(unclean_df)
# if not extracted_data:
# error_message = "No data extracted from Excel file"
# [Link](error_message, url=excel_url)
# [Link] = False
# self.error_message = error_message
# return
#
# try:
# self.data_df = [Link](extracted_data)
# self.data_df.insert(0, 'Sr. No.', range(1, len(self.data_df)
+ 1))
# data=self.data_df.to_dict('records')
# self.data_df = [Link](data)
#
# self._process_save_data( self.data_df )
#
# [Link] = True
# self.error_message = None
#
# [Link](f"Excel data extraction completed successfully",
# records_processed=len(self.data_df),
# columns=list(self.data_df.columns),
# url=excel_url)
#
# except Exception as df_error:
# error_message = f"Error creating DataFrame: {str(df_error)}"
# [Link](error_message,
# error_details=str(df_error),
# traceback=traceback.format_exc(),
# url=excel_url)
# [Link] = False
# self.error_message = error_message
#
# except Exception as excel_error:
# error_message = f"Error processing Excel file:
{str(excel_error)}"
# [Link](error_message,
# error_details=str(excel_error),
# traceback=traceback.format_exc(),
# url=excel_url)
# [Link] = False
# self.error_message = error_message
#
# except Exception as e:
# error_message = f"Unexpected critical error in Excel extraction:
{str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc())
# [Link] = False
# self.error_message = error_message
# raise
#
# # this function for save excel data which is cleaned
# def _process_save_data(self, data):
# try:
# [Link](f"converting cleaned {len(data)} records to dataframe")
# self.data_df=data
# source_url = '[Link]
of-surrender-members'
# if 'Sr. No.' in self.data_df.columns:
# self.data_df.insert(1, 'Source URL', source_url)
# else:
# self.data_df.insert(0, 'Source URL', source_url)
#
# output_dir = f"output_files/{[Link]}/"
# [Link](output_dir, exist_ok=True)
# timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
# self.output_file_path =
f"{output_dir}/{self.source_name}_{timestamp}.xlsx"
#
# self.data_df.to_excel(self.output_file_path, index=False)
#
# [Link](f"Data successfully saved",
# filerecord_count_path=self.output_file_path,
# record_count=len(self.data_df),
# columns=list(self.data_df.columns),
# timestamp=[Link]().strftime('%Y-%m-%d %H:%M:%S'))
#
# except Exception as e:
# [Link](f"Error saving data to Excel",
# error_details=str(e),
# traceback=traceback.format_exc())
# raise
# # this function store uncleaned data in excel
# def _save_uncleaned_data(self, unclean_df):
# try:
# [Link](f"converting unclean {len(unclean_df)} records to
dataframe")
#
# if 'Sr. No.' in unclean_df.columns:
# unclean_df = unclean_df.drop(columns=['Sr. No.'])
# if 'Date of Approval for Surrender of membership' in
unclean_df.columns:
# unclean_df['Date of Approval for Surrender of membership'] =
pd.to_datetime(
# unclean_df['Date of Approval for Surrender of membership'],
# errors='coerce',
# dayfirst=True
# )
# unclean_df['Date of Approval for Surrender of membership'] =
unclean_df[
# 'Date of Approval for Surrender of membership'
# ].[Link]('%d-%b-%y')
# unclean_df['Date of Approval for Surrender of membership'] =
unclean_df[
# 'Date of Approval for Surrender of membership'
# ].fillna('')
#
# source_url = '[Link]
of-surrender-members'
# unclean_df.insert(0, 'Sr. No.', range(1, len(unclean_df) + 1))
#
# input_dir = f"input_files/{[Link]}/"
# [Link](input_dir, exist_ok=True)
# timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
# output_path = f"{input_dir}/{self.source_name}_{timestamp}.xlsx"
#
# unclean_df.to_excel(output_path, index=False, engine='openpyxl')
#
# [Link](f"Uncleaned data saved as Excel with exact date format
(dd-mmm-yy)",
# file_path=output_path,
# record_count=len(unclean_df),
# columns=list(unclean_df.columns))
#
# except Exception as e:
# [Link](f"Error saving uncleaned data",
# error_details=str(e),
# traceback=traceback.format_exc())
# raise
#
# def closed(self, reason):
# row_count = 0
# if hasattr(self, 'data_df') and isinstance(self.data_df, [Link])
and not self.data_df.empty:
# row_count = len(self.data_df)
#
# if hasattr(self, 'success') and [Link]:
# [Link](f"Spider closed successfully, updating status with
{row_count} records",
# source_id=self.source_id,
# source_description=self.source_description)
# update_scraping_status_on_success(self.source_id, row_count)
#
# if hasattr(self, 'output_file_path') and self.output_file_path:
# [Link](f"Uploading results to S3",
# file_path=self.output_file_path,
# source_id=self.source_id,
# source_description=self.source_description)
# upload_to_s3_and_update_db(self.output_file_path,
self.source_dict, logger)
# else:
# [Link]("No output file path available, skipping S3
upload",
# source_id=self.source_id,
# source_description=self.source_description)
# else:
# error_msg = self.error_message if hasattr(self,
# 'error_message') and
self.error_message else f"Spider closed with reason: {reason}"
# [Link](f"Spider closed with errors, updating failure status:
{error_msg}",
# source_id=self.source_id,
# source_description=self.source_description if
hasattr(self,
#
'source_description') else "unknown")
# update_scraping_status_on_failure(self.source_id, error_msg,
row_count)
#
#
# def run(source_dict=None):
# try:
# source_id = source_dict.get("source_id")
# source_description = source_dict.get("source_description", "List of
Surrendered/Ceased Members")
#
# [Link]("Starting spider execution",
# source_id=source_id,
# source_description=source_description)
#
# try:
# from [Link] import CrawlerProcess
# process = CrawlerProcess(settings)
# [Link](McxSpiderDataSpider, source_dict=source_dict)
# [Link]()
# except Exception as crawler_error:
# [Link](f"CrawlerProcess approach failed:
{str(crawler_error)}. Trying fallback method.",
# error_details=str(crawler_error))
#
# spider = McxSpiderDataSpider(source_dict=source_dict)
# try:
# requests = list(spider.start_requests() or [])
# if requests:
# for request in requests:
# response = None
# if 'response_object' in [Link]:
# response = [Link]['response_object']
# list([Link](response) or [])
# else:
# [Link]("No requests generated from start_requests")
# [Link](reason="finished")
# except Exception as manual_error:
# error_message = f"Error in manual spider execution:
{str(manual_error)}"
# [Link](error_message,
# error_details=str(manual_error),
# traceback=traceback.format_exc())
# raise
#
# [Link]("Spider execution completed successfully")
# return True
# except Exception as e:
# error_message = f"Critical error running spider: {str(e)}"
# [Link](error_message,
# error_details=str(e),
# traceback=traceback.format_exc(),
# source_id=source_id,
# source_description=source_description)
# update_scraping_status_on_failure(source_id, error_message, 0)
# return False
#
#
# if __name__ == '__main__':
# hardcoded_source_dict = {
# "source_id": "mcxindia_spider",
# "id": 77,
# "source_name": "www_mcxindia_com",
# "source_description": "List of Surrendered/Ceased Members",
# "count": 0
# }
# run(hardcoded_source_dict)
import scrapy
import pandas as pd
import re
import os
import traceback
import warnings
import pycountry
from io import BytesIO
from datetime import datetime
from [Link] import urljoin
from [Link] import TextResponse
from src.helpers_scrapy.scrap_logger import ScrapperLogger
from src.helpers_scrapy.global_helpers import update_scraping_status_on_success,
update_scraping_status_on_failure, \
upload_to_s3_and_update_db
from src.helpers_scrapy.proxy_retry_helper import ProxyRetryHelper
[Link]("ignore", category=UserWarning, module="openpyxl")

logger = ScrapperLogger(script_name="mcxindia_spider", source="www_mcxindia_com")

settings = {
'BOT_NAME': 'watchlist_scrapper',
'ROBOTSTXT_OBEY': False,
'CONCURRENT_REQUESTS': 1,
'DOWNLOAD_DELAY': 3,
'COOKIES_ENABLED': True,
'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
}

# this library called pycountry provide country names so use it for get country so
same as and also save () sign
VALID_COUNTRIES = {[Link]() for c in [Link]}.union(
{c.alpha_2.lower() for c in [Link]}
)

def clean_text(text):
text_str = str(text)
print(text)
store=text_str.split(' ')
print(store)
text_str = [Link](r'$(?!.*?$).*', lambda m: [Link](0) + ')', text_str)

text_processed = [Link](
r'$([^)]*)$',
lambda m: f'({[Link](1)})' if [Link](1).strip().lower() in
VALID_COUNTRIES else [Link](1),
text_str
)
text_with_spaces = [Link](r'[^\w\s/\'\-&()]', ' ', text_processed)
cleaned_data = [Link](r'\s+', ' ', text_with_spaces).strip()
return cleaned_data if cleaned_data else ""

class DataSpiderSpider([Link]):
name = "mcxindia_spider"
source_name = "www_mcxindia_com"
source_id = None

def init(self, source_dict=None, *args, **kwargs):

try:
super(DataSpiderSpider, self).__init__(*args, **kwargs)
if source_dict and "source_id" in source_dict:
self.source_id = source_dict.get("source_id")
self.source_description = source_dict.get("source_description",
"List of Surrendered/Ceased
Members") if source_dict else "List of Surrendered/Ceased Members"
[Link](f"Initializing {[Link]} spider for {self.source_name}",
source_description=self.source_description)
self.proxy_helper = ProxyRetryHelper(logger)
self.data_df = None
self.output_file_path = None
self.source_dict = source_dict
self.error_message = None
[Link] = True

# headers
[Link] = {
'Accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/
webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/[Link] Safari/537.36'
}
[Link] = {
'_gid': 'GA1.2.1382501654.1744804578',
'_ga_8BQ43G0902': 'GS1.1.1744804578.1.0.1744804578.0.0.0',
'_ga': 'GA1.1.1289432087.1744804578',
'ASP.NET_SessionId': 'lexvhplwoxq2z1tjh5nukaj2',
}
except Exception as e:
[Link]("CRITICAL ERROR during initialization",
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = f"Initialization error: {str(e)}"
raise
# this function fetch starting page response and send to parse
def start_requests(self):
try:
req_url = '[Link]
surrender-members'
[Link](f"Starting data extraction process",
url=req_url,
source=self.source_name,
source_description=self.source_description)

if not hasattr(self, 'proxy_helper') or self.proxy_helper is None:

error_message = "proxy_helper is not initialized properly"
[Link](error_message)
[Link] = False
self.error_message = error_message
return

[Link](f"Fetching initial content with proxy retry helper",

url=req_url)

try:
html_content = self.proxy_helper.fetch_with_retries(req_url)
except Exception as proxy_error:
error_message = f"Error during proxy fetch: {str(proxy_error)}"
[Link](error_message,
error_details=str(proxy_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return

if html_content:
# Create a TextResponse object to parse the HTML
response = TextResponse(url=req_url, body=html_content,
encoding='utf-8')
[Link](f"Successfully retrieved content, proceeding to parse",
content_length=len(html_content))

# Create a new request with proper headers

yield [Link](
url=req_url,
callback=[Link],
dont_filter=True,
meta={
'response_object': response,
'original_headers': [Link],
'original_cookies':[Link]
},
headers=[Link],
cookies=[Link])
else:
error_message = f"Failed to retrieve data from {req_url} after all
retry attempts"
[Link](error_message, url=req_url)
[Link] = False
self.error_message = error_message
return []

except Exception as e:
error_message = f"Critical error in start_requests: {str(e)}"
[Link](error_message,
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return []

# this function is fetching excel xpath and send response in extract row data
function
def parse(self, response, **kwargs):
try:
[Link](f"Starting parsing process", url=[Link])

# Get headers from meta

headers = [Link]('original_headers', {})
cookies = [Link]('original_cookies', {})

if 'response_object' in [Link]:
response = [Link]['response_object']
[Link]("Using pre-fetched response from meta")

try:
relative_url = [Link](
'//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
).get()

# Check if we found the expected data structure

if not relative_url:
[Link]("Excel file link not found in initial parse
attempt",
xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')

try:
[Link]("Attempting retry to fetch content",
url=[Link])
html_content =
self.proxy_helper.fetch_with_retries([Link])

if html_content:
response = TextResponse(url=[Link],
body=html_content, encoding='utf-8')
relative_url = [Link](
'//div[@class="clearfix mrT20"]//div[@class="main-
pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href'
).get()

if not relative_url:
error_message = "Failed to find Excel file link
even after retry"
[Link](error_message,
xpath_used='//div[@class="clearfix
mrT20"]//div[@class="main-pdf-box"]/div[@class="pdf-box excel-box"]/p/a/@href')
[Link] = False
self.error_message = error_message
return
else:
[Link]("Successfully retrieved Excel link
after retry")
else:
error_message = f"All proxy attempts failed for
{[Link]}"
[Link](error_message, url=[Link])
[Link] = False
self.error_message = error_message
return
except Exception as retry_error:
error_message = f"Error during retry attempt:
{str(retry_error)}"
[Link](error_message,
error_details=str(retry_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return
else:
[Link]("Found Excel file link in initial parse")

# Process the Excel file link

if relative_url:
full_url = urljoin([Link], relative_url)
[Link](f"Found Excel file link, proceeding to download",
excel_url=full_url)

try:
# Create a new request for inner response so need to do
change here for loop of table to scrapy request
yield [Link](
url=full_url,
headers=headers,
cookies=cookies,
callback=self._extract_row_data,
meta={
'original_headers': headers,
'original_cookies':cookies}
)
except Exception as request_error:
error_message = f"Error creating Excel download request:
{str(request_error)}"
[Link](error_message,
error_details=str(request_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
return

except Exception as parse_error:

error_message = f"Error while parsing HTML content:
{str(parse_error)}"
[Link](error_message,
error_details=str(parse_error),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message

except Exception as e:
error_message = f"Unexpected critical error in parse method: {str(e)}"
[Link](error_message,
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message

# this function is used for fetching inner data of excel

def _extract_row_data(self, response):
try:
excel_url = getattr(response, 'url', 'URL_NOT_AVAILABLE')
[Link]("Starting Excel data extraction process",
url=excel_url,
source=self.source_name,
source_description=self.source_description)

extracted_data = []
unclean_records = []

try:
xls_file = BytesIO([Link])
xls = [Link](xls_file, engine='openpyxl')

for sheet_name in xls.sheet_names:

try:
[Link](f"Processing sheet: {sheet_name}")
df = [Link](sheet_name)
[Link] = [Link]()
raw_df = [Link]()
raw_df['status'] = sheet_name
unclean_records.extend(raw_df.to_dict('records'))

required_cols = ['Name of the Member', 'Member ID',

'Date of Approval for Surrender of
membership',
'SEBI Reg. No']
for col in required_cols:
if col not in [Link]:
df[col] = ''
if 'Member ID' in [Link]:
df['Member ID'] = df['Member
ID'].astype(str).[Link]()
if 'Date of Approval for Surrender of membership' in
[Link]:
df['Date of Approval for Surrender of membership'] =
pd.to_datetime(
df['Date of Approval for Surrender of membership'],
errors='coerce'
).[Link]('%Y-%m-%d')
df['Date of Approval for Surrender of membership'] =
df[
'Date of Approval for Surrender of
membership'].fillna('')

df['status'] = sheet_name

for field in ['Name of the Member', 'Member ID',

'Date of Approval for Surrender of
membership',
'SEBI Reg. No']:
if field in row:
record[field] = str(row[field]).strip() if
[Link](row[field]) else ''

name = record['Name of the Member']

# Handle Proprietor if present

if 'Proprietor' in name:
proprietor_match = [Link](r'Proprietor (.*?)
(?=\s*$|\s*$|$)', name)
if proprietor_match:
clean_proprietor =
proprietor_match.group(1)
proprietor = clean_proprietor.replace(')',
'')
record['Proprietor'] = proprietor
name = [Link](f"Proprietor
{proprietor_match.group(1)}", "").strip()
# Clean name and extract additional information
name = [Link](r'\((?!.*?$).*', lambda m:
[Link](0) + ')', name)

cleaned_name, additional_info, alias =

self.clean_member_name_and_info(name)
additional_info = [Link](r'$\s*\(', '(',
additional_info)
additional_info = [Link](r'$\s*\)', ')',
additional_info)
if '()' in additional_info:
additional_info=''

record['Name of the Member'] =

clean_text(cleaned_name)
record['Additional information'] = additional_info
record['Alias'] = alias
extracted_data.append(record)

except Exception as row_error:

[Link](f"Error processing row in sheet
{sheet_name}",
error_details=str(row_error))
continue

except Exception as sheet_error:

[Link](f"Error processing sheet {sheet_name}",
error_details=str(sheet_error),
traceback=traceback.format_exc())
continue

if unclean_records:
unclean_df = [Link](unclean_records)
self._save_uncleaned_data(unclean_df)

if not extracted_data:
error_message = "No data extracted from Excel file"
[Link](error_message, url=excel_url)
[Link] = False
self.error_message = error_message
return

try:
self.data_df = [Link](extracted_data)
self.data_df.insert(0, 'Sr. No.', range(1, len(self.data_df) +
1))
data = self.data_df.to_dict('records')
self.data_df = [Link](data)

self._process_save_data(self.data_df)

[Link] = True
self.error_message = None

except Exception as excel_error:

error_message = f"Error processing Excel file: {str(excel_error)}"
[Link](error_message,
error_details=str(excel_error),
traceback=traceback.format_exc(),
url=excel_url)
[Link] = False
self.error_message = error_message

except Exception as e:
error_message = f"Unexpected critical error in Excel extraction:
{str(e)}"
[Link](error_message,
error_details=str(e),
traceback=traceback.format_exc())
[Link] = False
self.error_message = error_message
raise

def is_as_is_content(self,content):
PROCESS_PHRASES = [
r'Formerly Known as',
r'Formarely Known as',
r'Currently known as',
r'A Division of',
r'Division of'
]
"""
Returns True if content should be kept as-is (doesn't start with any
PROCESS_PHRASES)
"""
content_lower = [Link]().strip()
return not any(
[Link]([Link]([Link]()), content_lower)
for phrase in PROCESS_PHRASES
)

def clean_member_name_and_info(self, name):

alias_phrases = [
r'Formerly Known as (.*?)(?=\s*$|\s*\(|$)',
r'Formarely Known as (.*?)(?=\s*$|\s*\(|$)',
r'Formarely Known as (.*?)(?=\s*$|\s*\(|$)',
r'Currently known as (.*?)(?=\s*$|\s*\(|$)',
]

additional_phrases = [
r'A Division of (.*?)(?=\s*$|\s*$|$)',
r'Division of (.*?)(?=\s*$|\s*\(|$)',
r'\(A Division of ([^\($]*)'

additional_info = ''
alias = ''

if [Link](r'$.?Commodity Trading Division.?$', name, [Link]):

base_name = [Link]('(')[0].strip()
additional_info = [Link]()
return base_name, additional_info, alias

# Extract all brackets

brackets = [Link](r'$(.*?)$', name)

# Check for alias patterns first

for b in brackets:
for pattern in alias_phrases:
match = [Link](pattern, b, flags=[Link])
if match:
alias = [Link](0).strip()
name_wo_brackets = [Link](r'\s*$.*?$', '', name).strip()
member_name = name_wo_brackets
return member_name, additional_info, alias

# Check for additional info patterns

for b in brackets:
for pattern in additional_phrases:
match = [Link](pattern, b, flags=[Link])
if match:
phrase_content = [Link](1).strip()
name_wo_brackets = [Link](r'\s*$.*?$', '', name).strip()
member_name = name_wo_brackets
# Modified to match client's expected format
if "A Division of" in b:
additional_info = f"{member_name} (A Division of
{phrase_content})"
else:
additional_info = f"{member_name} (Division of
{phrase_content})"
return member_name, additional_info, alias

# Handle general brackets not in as_it_is_list

for b in brackets:
if self.is_as_is_content(b): # If content doesn't start with special
phrases
# Keep as-is (original behavior for as_it_is_list)
continue
else:
# Process the content (your existing logic)
name_wo_brackets = [Link](r'\s*$' + [Link](b) + r'$', '',
name).strip()
additional_info = f"{name_wo_brackets} ({clean_text(b)})"
return [Link](), additional_info, alias
return [Link](), '', alias

# this function for save excel data which is cleaned

def _process_save_data(self, data):
try:
[Link](f"converting cleaned {len(data)} records to dataframe")
self.data_df=data
source_url = '[Link]
surrender-members'
if 'Sr. No.' in self.data_df.columns:
self.data_df.insert(1, 'Source URL', source_url)
else:
self.data_df.insert(0, 'Source URL', source_url)

output_dir = f"output_files/{[Link]}/"
[Link](output_dir, exist_ok=True)
timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
self.output_file_path =
f"{output_dir}/{self.source_name}_{timestamp}.xlsx"

self.data_df.to_excel(self.output_file_path, index=False)

[Link](f"Data successfully saved",

filerecord_count_path=self.output_file_path,
record_count=len(self.data_df),
columns=list(self.data_df.columns),
timestamp=[Link]().strftime('%Y-%m-%d %H:%M:%S'))

except Exception as e:
[Link](f"Error saving data to Excel",
error_details=str(e),
traceback=traceback.format_exc())
raise

# this function store uncleaned data in excel

def _save_uncleaned_data(self, unclean_df):
try:
[Link](f"converting unclean {len(unclean_df)} records to
dataframe")

if 'Sr. No.' in unclean_df.columns:

unclean_df = unclean_df.drop(columns=['Sr. No.'])
if 'Date of Approval for Surrender of membership' in
unclean_df.columns:
unclean_df['Date of Approval for Surrender of membership'] =
pd.to_datetime(
unclean_df['Date of Approval for Surrender of membership'],
errors='coerce',
dayfirst=True
)
unclean_df['Date of Approval for Surrender of membership'] =
unclean_df[
'Date of Approval for Surrender of membership'
].[Link]('%d-%b-%y')
unclean_df['Date of Approval for Surrender of membership'] =
unclean_df[
'Date of Approval for Surrender of membership'
].fillna('')

source_url = '[Link]
surrender-members'
unclean_df.insert(0, 'Sr. No.', range(1, len(unclean_df) + 1))
input_dir = f"input_files/{[Link]}/"
[Link](input_dir, exist_ok=True)
timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
output_path = f"{input_dir}/{self.source_name}_{timestamp}.xlsx"

unclean_df.to_excel(output_path, index=False, engine='openpyxl')

[Link](f"Uncleaned data saved as Excel with exact date format (dd-

mmm-yy)",
file_path=output_path,
record_count=len(unclean_df),
columns=list(unclean_df.columns))

except Exception as e:
[Link](f"Error saving uncleaned data",
error_details=str(e),
traceback=traceback.format_exc())
raise

def closed(self, reason):

"""
Method called when spider closes, ideal place to update status and upload
to S3
"""
# Get record count
row_count = 0
if hasattr(self, 'data_df') and isinstance(self.data_df, [Link]) and
not self.data_df.empty:
row_count = len(self.data_df)

# Update the status based on success flag

if hasattr(self, 'success') and [Link]:
[Link](f"Spider closed successfully, updating status with
{row_count} records",
source_id=self.source_id,
source_description=self.source_description)

# First update the scraping status

update_scraping_status_on_success(self.source_id, row_count)

# Then upload to S3 if a file was generated

if hasattr(self, 'output_file_path') and self.output_file_path:
[Link](f"Uploading results to S3",
file_path=self.output_file_path,
source_id=self.source_id,
source_description=self.source_description)
upload_to_s3_and_update_db(self.output_file_path, self.source_dict,
logger)
else:
[Link]("No output file path available, skipping S3 upload",
source_id=self.source_id,
source_description=self.source_description)
else:
error_msg = self.error_message if hasattr(self,
'error_message') and
self.error_message else f"Spider closed with reason: {reason}"
[Link](f"Spider closed with errors, updating failure status:
{error_msg}",
source_id=self.source_id,
source_description=self.source_description if
hasattr(self,

'source_description') else "unknown")

# Use json_serialize for source_dict if needed
print(error_msg)
update_scraping_status_on_failure(self.source_id, error_msg, row_count)

def run(source_dict=None):
"""
Run the spider with the given source dictionary.
This function can be called from other scripts.

Args:
source_dict: Dictionary containing source information

Returns:
True if the spider ran successfully, False otherwise
"""

try:
# Extract source ID from the source_dict
source_id = source_dict.get("source_id")
source_description = source_dict.get("source_description", "BIS Doc
Government Source")

# If source_dict contains datetime objects, ensure they're properly

serialized
[Link]("Starting spider execution",
source_id=source_id,
source_description=source_description)

try:
# Try the CrawlerProcess approach first
from [Link] import CrawlerProcess

# Start a process to run the spider

process = CrawlerProcess(settings)

# Add the spider to the process with the source_dict

[Link](DataSpiderSpider, source_dict=source_dict)

# Run the process (which blocks until the crawl is finished)

[Link]()

except Exception as crawler_error:

[Link](f"CrawlerProcess approach failed: {str(crawler_error)}.
Trying fallback method.",
error_details=str(crawler_error))

# Fallback to manual spider execution

spider = DataSpiderSpider(source_dict=source_dict)

# Manually execute the spider's methods

try:
# Get requests from start_requests
requests = list(spider.start_requests() or [])
if requests:
for request in requests:
# Process each request manually
response = None
if 'response_object' in [Link]:
response = [Link]['response_object']

# Call parse method with the response

list([Link](response) or [])
else:
[Link]("No requests generated from start_requests")

# Manually call the closed method

[Link](reason="finished")

except Exception as manual_error:

error_message = f"Error in manual spider execution:
{str(manual_error)}"
[Link](error_message,
error_details=str(manual_error),
traceback=traceback.format_exc())
raise

# Manual status update in case the spider's closed method wasn't called
update_scraping_status_on_failure(source_id, error_message, 0)
return False

if __name__ == '__main__':
hardcoded_source_dict = {
"source_id": "mcxindia_spider",
"id": 77,
"source_name": "www_mcxindia_com",
"source_description": "List of Surrendered/Ceased Members",
"count": 0
}
run(hardcoded_source_dict)

MCX India Membership Data Scraper
No ratings yet
MCX India Membership Data Scraper
17 pages
MCX India Spider Data Extraction Script
No ratings yet
MCX India Spider Data Extraction Script
13 pages
MCX India Membership Data Scraper
No ratings yet
MCX India Membership Data Scraper
13 pages
MCX India Membership Data Scraper
No ratings yet
MCX India Membership Data Scraper
13 pages
API Data Fetching and Processing Script
No ratings yet
API Data Fetching and Processing Script
1 page
Web Scraping Quick Start Guide
No ratings yet
Web Scraping Quick Start Guide
7 pages
API Request Handling in Python
No ratings yet
API Request Handling in Python
1 page
Boto SES + Python Twisted GitHub
No ratings yet
Boto SES + Python Twisted GitHub
20 pages
Paine Lip TV Hls
No ratings yet
Paine Lip TV Hls
18 pages
Scrapy Spider for CMS Violation Data
No ratings yet
Scrapy Spider for CMS Violation Data
10 pages
SEO Review Tools API Integration
No ratings yet
SEO Review Tools API Integration
15 pages
Chrome Media Server v14
No ratings yet
Chrome Media Server v14
26 pages
Hls Tester
No ratings yet
Hls Tester
5 pages
Saudi Real Estate Scraper Tool
No ratings yet
Saudi Real Estate Scraper Tool
23 pages
Hybrid Web Scraping Techniques Overview
No ratings yet
Hybrid Web Scraping Techniques Overview
8 pages
Flask API for Product Recommendations
No ratings yet
Flask API for Product Recommendations
26 pages
Advanced Techniques for Bypassing Anti-Scraping
No ratings yet
Advanced Techniques for Bypassing Anti-Scraping
4 pages
Book API Client Implementation
No ratings yet
Book API Client Implementation
3 pages
Supercell Hotmail - Py
No ratings yet
Supercell Hotmail - Py
13 pages
Pactice 1
No ratings yet
Pactice 1
11 pages
Practice 2
No ratings yet
Practice 2
11 pages
Practice 3
No ratings yet
Practice 3
11 pages
Practice 3
No ratings yet
Practice 3
11 pages
Pactice 1
No ratings yet
Pactice 1
11 pages
Web Scraping Cheat Sheet Guide
No ratings yet
Web Scraping Cheat Sheet Guide
10 pages
Scrapy: Key Concepts and Workflow Guide
No ratings yet
Scrapy: Key Concepts and Workflow Guide
3 pages
Mastering HTTP Requests with Python
No ratings yet
Mastering HTTP Requests with Python
10 pages
Simple HTTP Server in Python
No ratings yet
Simple HTTP Server in Python
1 page
Pinterest Video Creation Pipeline
No ratings yet
Pinterest Video Creation Pipeline
42 pages
Ethical Web Scraping for Public Data
No ratings yet
Ethical Web Scraping for Public Data
1 page
Deriv API Python Integration Guide
No ratings yet
Deriv API Python Integration Guide
2 pages
VK User Parser with Proxy Support
No ratings yet
VK User Parser with Proxy Support
6 pages
S8 Create An API With The Following Endpoints POST Student GET S
No ratings yet
S8 Create An API With The Following Endpoints POST Student GET S
34 pages
S8 Create An API With The Following Endpoints POST Student GET S
No ratings yet
S8 Create An API With The Following Endpoints POST Student GET S
34 pages
Ex 12345
No ratings yet
Ex 12345
5 pages
Company Data Extraction Tool
No ratings yet
Company Data Extraction Tool
7 pages
Fetcher - Py v3.2 Threading Optimized
No ratings yet
Fetcher - Py v3.2 Threading Optimized
9 pages
Team Report Building A RAGfor Refugee Support
No ratings yet
Team Report Building A RAGfor Refugee Support
11 pages
Web Crawling with Scrapy Basics
No ratings yet
Web Crawling with Scrapy Basics
77 pages
Black Duck Hub API Methods Guide
No ratings yet
Black Duck Hub API Methods Guide
2 pages
API Projects: Weather, Crypto, COVID, Stocks
No ratings yet
API Projects: Weather, Crypto, COVID, Stocks
10 pages
URL Testing with Python Requests
No ratings yet
URL Testing with Python Requests
5 pages
Web Log Data Cleaning Guide
No ratings yet
Web Log Data Cleaning Guide
3 pages
Serverless URL Shortener - AWS Console Step-By-Step Guide
No ratings yet
Serverless URL Shortener - AWS Console Step-By-Step Guide
32 pages
Stalker Portal API Integration Guide
No ratings yet
Stalker Portal API Integration Guide
24 pages
Scrapy Spider for NHTSA Violation Data
No ratings yet
Scrapy Spider for NHTSA Violation Data
22 pages
Scrapy Spider for NHTSA Violation Data
No ratings yet
Scrapy Spider for NHTSA Violation Data
22 pages
Download Files from Websites
No ratings yet
Download Files from Websites
3 pages
Email Domain Environment Checker
No ratings yet
Email Domain Environment Checker
2 pages
Scraping DOJ Violation Data with Scrapy
No ratings yet
Scraping DOJ Violation Data with Scrapy
12 pages
Making Web Requests with Python
No ratings yet
Making Web Requests with Python
2 pages
Flask Pet Store with Adoption Stats
No ratings yet
Flask Pet Store with Adoption Stats
4 pages
Stock Market Trading System Project
No ratings yet
Stock Market Trading System Project
55 pages
Making Web Requests with Python
No ratings yet
Making Web Requests with Python
2 pages
Flask API for Airtime Loading
No ratings yet
Flask API for Airtime Loading
4 pages
Xolo Sniper Bot for Roblox Assets
No ratings yet
Xolo Sniper Bot for Roblox Assets
9 pages
Celery Best Practices for Django Tasks
No ratings yet
Celery Best Practices for Django Tasks
5 pages
Debugging Techniques for Scrapy Spiders
No ratings yet
Debugging Techniques for Scrapy Spiders
4 pages
Subdomain Enumeration Tool in Python
No ratings yet
Subdomain Enumeration Tool in Python
11 pages
57 Changes Require
No ratings yet
57 Changes Require
1 page
Scrapy Spider for CMS Violation Data
No ratings yet
Scrapy Spider for CMS Violation Data
10 pages
Scrapy Spider for EEOC Violation Data
No ratings yet
Scrapy Spider for EEOC Violation Data
10 pages
Scrapy Spider for DOJ Civil Violations
No ratings yet
Scrapy Spider for DOJ Civil Violations
8 pages
OFAC Violation Data Scraper Script
No ratings yet
OFAC Violation Data Scraper Script
7 pages
Gov Process
No ratings yet
Gov Process
1 page
57 Changes Require
No ratings yet
57 Changes Require
1 page
77 Changes Need
No ratings yet
77 Changes Need
1 page
Excel Data Extraction Process
No ratings yet
Excel Data Extraction Process
4 pages
Business Name Variations and Aliases
No ratings yet
Business Name Variations and Aliases
1 page
Vendor Compliance and Watchlist Data
No ratings yet
Vendor Compliance and Watchlist Data
23 pages
Hotel Management System Requirements
No ratings yet
Hotel Management System Requirements
3 pages
Hafıza Testi Senaryoları ve Durumları
100% (1)
Hafıza Testi Senaryoları ve Durumları
54 pages
Facebook Application Test Cases Summary
No ratings yet
Facebook Application Test Cases Summary
3 pages
Excel Data Validation Guide
No ratings yet
Excel Data Validation Guide
5 pages
CSPRO Data Entry Application Guide
No ratings yet
CSPRO Data Entry Application Guide
22 pages
Daily Testing Report Summary
No ratings yet
Daily Testing Report Summary
59 pages
Login Page Test Cases Overview
No ratings yet
Login Page Test Cases Overview
4 pages
Fault Finding and Troubleshooting Guide
No ratings yet
Fault Finding and Troubleshooting Guide
40 pages
U-WAM System Error Messages Guide
No ratings yet
U-WAM System Error Messages Guide
20 pages
3900 Series Base Station LMT User Guide (V100R009C00 11) (PDF) - en
67% (3)
3900 Series Base Station LMT User Guide (V100R009C00 11) (PDF) - en
263 pages
NET-i Viewer User Manual v1.34
No ratings yet
NET-i Viewer User Manual v1.34
89 pages
JPetStore Test Cases Overview
75% (8)
JPetStore Test Cases Overview
5 pages
User Account Management Test Cases
No ratings yet
User Account Management Test Cases
8 pages
Flight Reservation System Overview
No ratings yet
Flight Reservation System Overview
3 pages
Kodak DryView 8200 Error Codes Guide
No ratings yet
Kodak DryView 8200 Error Codes Guide
16 pages
Phenotype API Error Log Analysis
No ratings yet
Phenotype API Error Log Analysis
19 pages
Fort App Login and Registration Tests
No ratings yet
Fort App Login and Registration Tests
6 pages
Effective UI Design: Error Messages & Color
No ratings yet
Effective UI Design: Error Messages & Color
22 pages
IRDA Broker Registration Manual
No ratings yet
IRDA Broker Registration Manual
181 pages
FinalProFinalBirtukan Group Ad Haf
No ratings yet
FinalProFinalBirtukan Group Ad Haf
57 pages
UML for Library Management System
85% (89)
UML for Library Management System
27 pages
IRDA Surveyor License Renewal Manual
No ratings yet
IRDA Surveyor License Renewal Manual
140 pages
Hägglunds Spider2 Control System Manual
No ratings yet
Hägglunds Spider2 Control System Manual
68 pages
User Interface Design in OOS Engineering
No ratings yet
User Interface Design in OOS Engineering
28 pages
VITEK2 COMPACT Error Codes Guide
No ratings yet
VITEK2 COMPACT Error Codes Guide
18 pages
Excel Data Validation Guide
No ratings yet
Excel Data Validation Guide
5 pages
Expense Tracker System Flow Visualization
No ratings yet
Expense Tracker System Flow Visualization
14 pages
Dfield 81
No ratings yet
Dfield 81
78 pages
SAP Purchase Requisition Upload Guide
No ratings yet
SAP Purchase Requisition Upload Guide
12 pages
Excel Data Management: Sorting & Validation
No ratings yet
Excel Data Management: Sorting & Validation
3 pages

MCX India Spider Data Extraction Script

Uploaded by

MCX India Spider Data Extraction Script

Uploaded by

# import scrapy

logger = ScrapperLogger(script_name="mcxindia_spider", source="www_mcxindia_com")

def __init__(self, source_dict=None, *args, **kwargs):

if not hasattr(self, 'proxy_helper') or self.proxy_helper is None:

[Link](f"Fetching initial content with proxy retry helper",

# Create a new request with proper headers

# Get headers from meta

# Check if we found the expected data structure

# Process the Excel file link

except Exception as parse_error:

# this function is used for fetching inner data of excel

for sheet_name in xls.sheet_names:

required_cols = ['Name of the Member', 'Member ID',

for _, row in [Link]():

for field in ['Name of the Member', 'Member ID',

name = record['Name of the Member']

# Handle Proprietor if present

cleaned_name, additional_info, alias =

record['Name of the Member'] =

except Exception as row_error:

except Exception as sheet_error:

[Link](f"Excel data extraction completed successfully",

except Exception as excel_error:

def clean_member_name_and_info(self, name):

if [Link](r'\(.*?Commodity Trading Division.*?\)', name, [Link]):

# Extract all brackets

# Check for alias patterns first

# Check for additional info patterns

# Handle general brackets not in as_it_is_list

# this function for save excel data which is cleaned

[Link](f"Data successfully saved",

# this function store uncleaned data in excel

if 'Sr. No.' in unclean_df.columns:

unclean_df.to_excel(output_path, index=False, engine='openpyxl')

[Link](f"Uncleaned data saved as Excel with exact date format (dd-

def closed(self, reason):

# Update the status based on success flag

# First update the scraping status

# Then upload to S3 if a file was generated

'source_description') else "unknown")

# If source_dict contains datetime objects, ensure they're properly

# Start a process to run the spider

# Add the spider to the process with the source_dict

# Run the process (which blocks until the crawl is finished)

except Exception as crawler_error:

# Fallback to manual spider execution

# Manually execute the spider's methods

# Call parse method with the response

# Manually call the closed method

except Exception as manual_error:

[Link]("Spider execution completed successfully")

You might also like

def init(self, source_dict=None, *args, **kwargs):

if [Link](r'\(.?Commodity Trading Division.?\)', name, [Link]):