0% found this document useful (0 votes)

8 views22 pages

Scrapy Spider for NHTSA Violation Data

The document outlines a Python Scrapy spider designed to scrape violation data from the Good Jobs First website, specifically targeting NHTSA-related entries. It includes configurations for session management, custom headers, and methods for handling pagination and data extraction. The spider processes company names, extracts relevant details, and stores the data for export.

Uploaded by

dhruvchavda447

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

8 views22 pages

Scrapy Spider for NHTSA Violation Data

Uploaded by

dhruvchavda447

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as TXT, PDF, TXT or read online on Scribd

# import random

# import re
# import os
# import string
# import scrapy
# import pandas as pd
# from datetime import datetime
# from [Link] import execute
#
#
# # Spider class for scraping violation data from Good Jobs First (OFAC-related
entries)
# class Act115Spider([Link]):
# name = "NHTSA"
#
# # Preset cookies required for session management and access
# cookies = {
# '_fbp': 'fb.1.1747201269861.315460754820040010',
# '_gid': 'GA1.2.38545643.1747749932',
# '_ga_Q4G4E8KT5J': 'GS2.1.s1747827357$o2$g1$t1747827638$j0$l0$h0',
# 'PHPSESSID': '454b72dc00e908524384cdb37fddb976',
# '_gat_UA-21812781-2': '1',
# '_ga_9VW1HCFL7C': 'GS2.1.s1747997430$o42$g1$t1747999957$j0$l0$h0',
# '_ga': 'GA1.1.1470230669.1747201268',
# }
#
# headers = {
# 'accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/
webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
# 'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,hi;q=0.7',
# 'cache-control': 'no-cache',
# 'pragma': 'no-cache',
# 'priority': 'u=0, i',
# 'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136",
"Not.A/Brand";v="99"',
# 'sec-ch-ua-mobile': '?0',
# 'sec-ch-ua-platform': '"Windows"',
# 'sec-fetch-dest': 'document',
# 'sec-fetch-mode': 'navigate',
# 'sec-fetch-site': 'none',
# 'sec-fetch-user': '?1',
# 'upgrade-insecure-requests': '1',
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/[Link] Safari/537.36',
# # 'cookie': '_fbp=fb.1.1747201269861.315460754820040010;
_gid=GA1.2.38545643.1747749932;
_ga_Q4G4E8KT5J=GS2.1.s1747827357$o2$g1$t1747827638$j0$l0$h0;
PHPSESSID=454b72dc00e908524384cdb37fddb976; _gat_UA-21812781-2=1;
_ga_9VW1HCFL7C=GS2.1.s1747997430$o42$g1$t1747999957$j0$l0$h0;
_ga=GA1.1.1470230669.1747201268',
# }
# custom_settings = {
# "DOWNLOAD_HANDLERS": {
# "http": "scrapy_impersonate.ImpersonateDownloadHandler",
# "https": "scrapy_impersonate.ImpersonateDownloadHandler",
# },
# "TWISTED_REACTOR":
"[Link]",
# }
#
# def __init__(self):
# # Store all parsed data in a list for final export
# [Link] = []
# self.raw_data=[]
# self.data_cleaned = []
# self.data_uncleaned = []
# [Link] = [Link]([Link](), "exports") # or specify your own
path
#
# # Create the directory if it doesn't exist
# [Link]([Link], exist_ok=True)
# [Link] = "[Link]
company_op=starts&company=&offense_group=&agency_code=NHTSA"
# def start_requests(self):
# browsers = [
# "chrome110",
# "edge99",
# "safari15_5"
# ]
# meta = {}
# meta['impersonate'] = [Link](browsers)
# meta['source_url'] = [Link]
#
# # Send initial request with custom headers and cookies
# yield [Link](url=[Link],meta=meta, headers=[Link],
cookies=[Link], callback=self.parse_listing)
#
# def parse_listing(self, response):
# company_links =
[Link]('//table[2]/tbody/tr/td[1]/a/@href').getall()
# source_url = [Link]('source_url')
#
# for link in company_links:
# browsers = [
# "chrome110",
# "edge99",
# "safari15_5"
# ]
# meta = {}
# meta['impersonate'] = [Link](browsers)
# meta['source_url']=source_url
#
# yield [Link](link,meta=meta, headers=[Link],
cookies=[Link], callback=self.parse_details)
# # break # Use during testing to limit pagination
#
# # Handle pagination by identifying total pages and iterating through all
# next_page = [Link]('//a[contains(text(),">>")]/@href').get()
# if next_page:
# total_page = int(next_page.split('page=')[-1])
# for page in range(0, total_page+1):
# paginated_url = f"{[Link]}&page={page}"
# browsers = [
# "chrome110",
# "edge99",
# "safari15_5"
# ]
# meta = {}
# meta['impersonate'] = [Link](browsers)
# meta['source_url']=paginated_url
#
# yield [Link](url=paginated_url,meta=meta,
headers=[Link], cookies=[Link], callback=self.parse_listing)
#
# def parse_details(self, response):
# # Known countries list
# self.known_countries = ['india ', 'usa','us', 'united states', 'uk',
'canada', 'australia', 'bahamas',
# 'singapore', 'germany', 'france', 'hong kong',
'north america', 'america']
# source_url = [Link]('source_url')
#
# # Extract all key-value details from the company profile page
# item = dict()
# raw_item = dict() # For storing completely raw data
#
# item["PDP URL"] = [Link]
# raw_item["PDP URL"] = [Link]
#
# for path in [Link]('//div[@id="contentResult"]//b'):
# item['source_url']=source_url
#
# data_skip = False
# key = [Link]('./text()').get(default='NA').strip()
# raw_value = \
# ''.join([Link]('./following-sibling::text() | ./following-
sibling::a//text()').getall()).split(
# '\n:')[0].replace(':\xa0', '').strip()
# value = raw_value # Initialize cleaned value with raw value
#
# # Store raw value first
# raw_item[key] = raw_value
#
# # Format date fields into YYYY-MM-DD format (only for cleaned data)
# if 'Date' in key:
# try:
# date_obj = [Link](value, "%B %d, %Y")
# value = date_obj.strftime("%Y-%m-%d")
# except Exception:
# pass
# # Remove "Note:" prefix and clean whitespace if key is "Note"
# if key == 'Notes':
# value = [Link](r'^Note:\s*', '', value).strip()
#
# # Special handling for Company field
# if key == 'Company':
# # First handle dba/d/b/a cases - extract alias and remove from
main name
# dba_pattern = r'(.*?)\s+(dba|d/b/a|doing\s+business\s+as)\s+(.+)'
# dba_match = [Link](dba_pattern, value, flags=[Link])
#
# aliases = []
# if dba_match:
# # Extract main name and alias
# main_name = dba_match.group(1).strip()
# dba_alias = dba_match.group(3).strip()
#
# # Simple cleaning - just remove trailing punctuation
# dba_alias = [Link](r'[.,;]+$', '', dba_alias).strip()
#
# # Add to aliases list and update value
# if dba_alias:
# [Link](dba_alias)
# value = main_name # Use just the main name part
#
# # Check if we should split on "and"
# split_keywords = [
# 'inc', 'inc ', 'incorporated', 'corp', 'corp ', 'Inc.',
'corporation',
# 'llc', 'ltd', 'limited', 'lp', 'llp', 'plc', 'communities',
'center',
# 'co', 'company', 'group'
# ]
#
# parts = [Link](r'\s+(and)\s+', value, flags=[Link])
# should_split = False
#
# if len(parts) > 1:
# left_part = parts[0].strip()
# right_part = parts[-1].strip()
#
# # Check if both parts end with company suffixes
# left_has_suffix = any([Link](rf'\b{kw}\.?$', left_part,
[Link]) for kw in split_keywords)
# right_has_suffix = any(
# [Link](rf'\b{kw}\.?$', right_part, [Link]) for
kw in split_keywords)
#
# should_split = left_has_suffix and right_has_suffix
#
# if should_split:
# # Handle the split case
# company_names = [parts[0].strip(), parts[2].strip()]
#
# # Create new items for each company
# items = []
# for name in company_names:
# new_item = [Link]()
# new_item['Company'] = self.clean_text_punct(name)
#
# # Only keep aliases for the first item if they exist
# if name == company_names[0] and aliases:
# for idx, alias_val in enumerate(aliases[:2],
start=1):
# new_item[f'Alias {idx}'] = alias_val
#
# [Link](new_item)
#
# return items
#
# # If we shouldn't split, proceed with normal processing
# # Now process parentheses content
# parens = [Link](r'$([^()]+)$', value)
# country = None
#
# # Check inside () for country or special cases
# for val in parens:
# cleaned_val = [Link]().lower()
# # Remove phrases like "now known as", "formerly", etc.
# cleaned_val = [Link](r'^(now known as|formerly known as|also
known as)\s+', '', cleaned_val,
# flags=[Link])
# if 'us' in cleaned_val:
# print("found")
#
# if cleaned_val in self.known_countries:
# # Preserve country in parentheses
# value = [Link](r'$' + [Link](val) + r'$', f'
({val})', value)
# country = cleaned_val
# else:
# if f'({val})' in value:
# [Link](cleaned_val)
#
# # Remove () content only if not country
# for val in parens:
# if [Link]() not in self.known_countries:
# value = [Link](r'$' + [Link](val) + r'$', ' ',
value)
#
# # Clean up company name
# company_name = self.clean_text_punct([Link]())
# item[key] = company_name
#
# # Assign aliases (simple assignment without splitting)
# for idx, alias_val in enumerate(aliases[:2], start=1):
# item[f'Alias {idx}'] = alias_val
#
# continue
# # Handle 'and' in aliases
# # split_aliases = [Link](r'\s+and\s+', alias_val,
flags=[Link])
# # for i, part in enumerate(split_aliases, start=idx):
# # part = [Link]()
# # if part:
# # part = [Link](r'^and\s*', '', part,
flags=[Link])
# # item[f'Alias {i}'] = part
# # idx += len(split_aliases) - 1
#
# # continue
#
# # Handle Current Parent Company (similar to Company but without
alias/country extraction)
# if key == 'Current Parent Company':
# # Clean up punctuation (keeping - ' & /)
# text_no_punct = [Link]([Link]('', '',
[Link]('-', '')
# .replace("'", '')
# .replace('&', '')
# .replace('/', '')))
# value = [Link](r'\s+', ' ', text_no_punct).strip()
# item[key + " name"] = value
# item[key + " url"] =
[Link]('./following-sibling::a/@href').get(default='')
# raw_item[key + " name"] = raw_value
# raw_item[key + " url"] =
[Link]('./following-sibling::a/@href').get(default='')
# continue
#
# # Handle special keys (Mega-Scandal, Source of Data)
# if 'Mega-Scandal' in key or 'Source of Data' in key:
# if 'Mega-Scandal' in key:
# item[key + " name"] = value
# item[key + " url"] =
[Link]('./following-sibling::a/@href').get(default='')
# raw_item[key + " name"] = raw_value
# raw_item[key + " url"] =
[Link]('./following-sibling::a/@href').get(default='')
# else:
# value =
[Link]('./following-sibling::a/@href').get(default='')
# raw_item[key] = value
# item[key] = value
# data_skip = True
#
# # Handle link-based values (e.g., "click here")
# if not data_skip:
# if "(click here)" in [Link]():
# value =
[Link]('./following-sibling::a/@href').get(default='')
# raw_item[key] = value
# item[key] = value # All other fields keep original punctuation
#
# # Additional handling for specific URL columns
# url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-
Scandal url']
# for url_col in url_columns:
# if url_col in item and isinstance(item[url_col], str) and "(click
here)" in item[url_col].lower():
# url_value = [Link](
# f'//b[contains(text(), "{url_col.replace(" url",
"")}")]/following-sibling::a/@href').get(
# default='')
# item[url_col] = url_value
# raw_item[url_col] = url_value
#
# split_items = self.split_company_names(item)
# for new_item in split_items:
# [Link](new_item)
# self.raw_data.append(raw_item)
#
# def split_company_names(self, item):
# split_keywords = [
# # Common US suffixes
# 'inc', 'inc ', 'incorporated', 'corp', 'corp ', 'Inc.',
'corporation',
# 'llc', 'ltd', 'limited', 'lp', 'llp', 'plc', 'communities', 'center',
# 'co', 'company', 'group'
# ]
# company_name = item['Company']
#
# # First handle dba/d/b/a cases - this needs to happen BEFORE any other
processing
# dba_pattern = r'^(.*?)\s+(dba|d/b/a|doing\s+business\s+as)\s+(.+)$'
# dba_match = [Link](dba_pattern, company_name, flags=[Link])
#
# if dba_match:
# # If we find dba pattern, process it immediately
# main_name = dba_match.group(1).strip()
# dba_alias = dba_match.group(3).strip()
#
# # Clean the alias
# dba_alias = [Link](r'[.,;]+$', '', dba_alias)
# dba_alias = [Link](r'$[^)]*$$', '', dba_alias).strip()
#
# # Create new item with main name and alias
# new_item = [Link]()
# new_item['Company'] = main_name
# new_item['Alias 1'] = dba_alias.lower()
#
# # Remove any other aliases since we're creating a fresh copy
# for k in list(new_item.keys()):
# if [Link]('Alias ') and k != 'Alias 1':
# del new_item[k]
#
# return [new_item]
#
# # Rest of your existing splitting logic for "and" cases
# # Split on 'and' or '&' with surrounding whitespace
# parts = [Link](r'\s+(and)\s+', company_name, flags=[Link])
#
# # If no splitting occurred, return original item in a list
# if len(parts) == 1:
# return [item]
#
# result_items = []
# current = parts[0].strip()
# i = 1
#
# while i < len(parts):
# sep = parts[i]
# next_part = parts[i + 1].strip()
#
# # Check if current ends with a keyword
# keyword_match = False
# for kw in split_keywords:
# if [Link](rf'\b{kw}\.?$', current, [Link]):
# keyword_match = True
# break
#
# if keyword_match:
# # Create a new item with the current company name
# new_item = [Link]()
#
# # Remove any aliases from the item copy since we'll reprocess
them
# for k in list(new_item.keys()):
# if [Link]('Alias '):
# del new_item[k]
#
# new_item['Company'] = current
# result_items.append(new_item)
# current = next_part
# else:
# current += f" {sep} {next_part}"
#
# i += 2
#
# # Add the last remaining part
# new_item = [Link]()
#
# # Remove any aliases from the item copy since we'll reprocess them
# for k in list(new_item.keys()):
# if [Link]('Alias '):
# del new_item[k]
#
# new_item['Company'] = current
# result_items.append(new_item)
#
# # Now reprocess each item to extract aliases specific to that company
# final_items = []
# for split_item in result_items:
# # Make a copy to avoid modifying the original
# processed_item = split_item.copy()
# company_name = processed_item['Company']
#
# # Extract and process aliases just for this company
# parens = [Link](r'$([^()]+)$', company_name)
# aliases = []
# country = None
#
# # Check inside () for country or aliases
# for val in parens:
# cleaned_val = [Link]().lower()
# if cleaned_val.upper() != "N.A.":
# # Remove phrases like "now known as", "formerly", etc.
# cleaned_val = [Link](r'^(now known as|formerly known as|also
known as)\s+', '', cleaned_val,
# flags=[Link])
#
# # Check if it's a known country
# if cleaned_val in self.known_countries:
# country = cleaned_val
# else:
# [Link](cleaned_val)
#
# # Handle dba or d/b/a aliases in main text - improved pattern
# dba_pattern = r'^(.*?)\s+(dba|d/b/a|doing\s+business\s+as)\s+(.+)$'
# dba_match = [Link](dba_pattern, company_name, flags=[Link])
# if dba_match:
# main_name = dba_match.group(1).strip()
# dba_alias = dba_match.group(3).strip()
# # Clean the alias
# dba_alias = [Link](r'[.,;]+$', '', dba_alias)
# dba_alias = [Link](r'$[^)]*$$', '', dba_alias).strip()
# if dba_alias:
# [Link](0, dba_alias.lower()) # Add at beginning as
primary alias
# processed_item['Company'] = main_name
#
# # Remove () content from company name
# processed_item['Company'] = [Link](r'$[^()]+$', '',
processed_item['Company']).strip()
#
# # Assign aliases if any (up to 2 aliases)
# for idx, alias_val in enumerate(aliases[:2], start=1):
# processed_item[f'Alias {idx}'] = alias_val
#
# final_items.append(processed_item)
#
# return final_items
# # def split_company_names(self, item):
# # split_keywords = [
# # # Common US suffixes
# # 'inc', 'inc ', 'incorporated', 'corp', 'corp ', 'corporation',
# # 'llc', 'ltd', 'limited', 'lp', 'llp', 'plc', 'communities',
'center',
# # 'co', 'company', 'group'
# # ]
# # company_name = item['Company']
# #
# # # First extract all parentheses content and their positions
# # paren_matches = list([Link](r'$([^()]+)$', company_name))
# # paren_contents = [[Link](1) for match in paren_matches]
# #
# # # Split on 'and' or '&' with surrounding whitespace
# # parts = [Link](r'\s+(and)\s+', company_name, flags=[Link])
# #
# # # If no splitting occurred, return original item in a list
# # if len(parts) == 1:
# # return [item]
# #
# # result_items = []
# # current = parts[0].strip()
# # i = 1
# #
# # while i < len(parts):
# # sep = parts[i]
# # next_part = parts[i + 1].strip()
# #
# # # Check if current or next_part ends with a keyword
# # keyword_match = False
# # for kw in split_keywords:
# # if [Link](rf'\b{kw}\.?$', current, [Link]):
# # keyword_match = True
# # break
# #
# # if keyword_match:
# # # Create a new item with the current company name
# # new_item = [Link]()
# #
# # # Find any parentheses that were in this part of the string
# # current_with_parens = current
# # for match in paren_matches:
# # if [Link](0) in current:
# # current_with_parens = [Link]([Link](0),
f'({[Link](1)})')
# #
# # new_item['Company'] = current_with_parens
# # result_items.append(new_item)
# # current = next_part
# # else:
# # current += f" {sep} {next_part}"
# #
# # i += 2
# #
# # # Add the last remaining part
# # new_item = [Link]()
# #
# # # Find any parentheses that were in this part of the string
# # current_with_parens = current
# # for match in paren_matches:
# # if [Link](0) in current:
# # current_with_parens = [Link]([Link](0),
f'({[Link](1)})')
# #
# # new_item['Company'] = current_with_parens
# # result_items.append(new_item)
# #
# # return result_items
#
# # def clean_text_punct(text):
# # """Replace punctuation with space (except - ' & /), then normalize
whitespace"""
# # if not isinstance(text, str) or [Link]() == '':
# # return text
# # keep_chars = {"&", "-", "/", "'"}
# # cleaned = ''.join(char if char not in [Link] or char in
keep_chars else ' ' for char in text)
# # return [Link](r'\s+', ' ', cleaned).strip()
#
# def extract_name_alias(self, entry):
# """
# Extract country from a given company name string.
# Returns a tuple: (name, country)
# """
# try:
# if not isinstance(entry, str):
# return entry, None
#
# entry = [Link]()
# country = None
#
# # Extract and remove country from parentheses
# parens = [Link](r'$([^()]+)$', entry)
# for val in parens:
# if [Link]() in self.known_countries:
# country = [Link]()
# entry = [Link](r'$' + [Link](val) + r'$', '',
entry).strip()
#
# return entry, country
#
# except Exception:
# return entry, None
# # 2. Remove punctuation except &-/'
# def clean_text_punct(self,text):
# """Clean punctuation and whitespace for Company-related fields"""
# if not isinstance(text, str) or [Link]() == '':
# return text
# keep_chars = {"&", "-", "/", "'"}
# cleaned = []
# text = [Link](".", " ")
#
# # Process the text and preserve country names inside parentheses
# parens = [Link](r'$([^()]+)$', text) # Extract all content inside
parentheses
# if "HK" not in text:
# text = [Link]("(", "abcdef")
# text = [Link](")", "fedcba")
#
#
# for char in text:
# if char in [Link] and char not in keep_chars:
# [Link](' ')
# else:
# [Link](char)
# cleaned = ''.join(cleaned)
# cleaned = [Link]("abcdef", "(")
# cleaned = [Link]("fedcba", ")")
#
# return [Link](r'\s+', ' ', ''.join(cleaned)).strip()
#
# def normalize_whitespace(self,text):
# """Normalize whitespace (used for non-Company fields)"""
# if not isinstance(text, str) or [Link]() == '':
# return text
# return [Link](r'\s+', ' ', text).strip()
#
#
# def close(self, reason):
# # Create DataFrames
# df_cleaned = [Link]([Link])
# df_uncleaned = [Link](self.raw_data) # Use completely raw data
#
# # Add common columns to both DataFrames
# df_cleaned.insert(0, 'ID', range(1, 1 + len(df_cleaned)))
#
# # Ensure source_url is second column
# if 'source_url' in df_cleaned.columns:
# cols = df_cleaned.[Link]()
# [Link](1, [Link]([Link]('source_url')))
# df_cleaned = df_cleaned[cols]
#
# # Only add ID to uncleaned data (without Source URL)
# df_uncleaned.insert(0, 'ID', range(1, 1 + len(df_uncleaned)))
# # df_uncleaned = df_uncleaned.drop(columns=['Current Parent Company
url','Source of Data'])
#
# # Save uncleaned file (completely raw data)
# timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
# filename_uncleaned =
f"raw_data_violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
# filepath_uncleaned = [Link]([Link], filename_uncleaned)
#
# with [Link](filepath_uncleaned, engine='xlsxwriter',
# engine_kwargs={'options': {'strings_to_numbers':
True}}) as writer:
# df_uncleaned.fillna("").to_excel(writer, index=False)
#
# # Process cleaned data
# protected_columns = ['ID', 'Source URL', 'PDP URL', 'Penalty', 'Date',
# 'Current Parent Company url', 'Archived Source',
'Mega-Scandal url']
# columns_to_clean = ['Company', 'Current Parent Company name']
#
# # 1. Replace N/A with blank except in protected columns
# for col in df_cleaned.columns:
# if col not in protected_columns:
# df_cleaned[col] = df_cleaned[col].replace(['N/A', 'NA'], '')
# columns_to_clean = ['Company', 'Current Parent Company']
#
# # Apply per-column logic
# for col in df_cleaned.columns:
# if col in protected_columns:
# continue
# if df_cleaned[col].dtype == object:
# if col in columns_to_clean:
# df_cleaned[col] =
df_cleaned[col].apply(self.clean_text_punct)
# else:
# df_cleaned[col] =
df_cleaned[col].apply(self.normalize_whitespace)
#
# # Save cleaned file
# filename_cleaned =
f"violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
# filepath_cleaned = [Link]([Link], filename_cleaned)
#
# with [Link](filepath_cleaned, engine='xlsxwriter',
# engine_kwargs={'options': {'strings_to_numbers':
True}}) as writer:
# df_cleaned.to_excel(writer, index=False)
# # Columns that should have punctuation removed
#
#
# if __name__ == '__main__':
# execute("scrapy crawl NHTSA".split())
import random
import re
import os
import string
import scrapy
import pandas as pd
from datetime import datetime
from [Link] import execute

# Spider class for scraping violation data from Good Jobs First (OFAC-related
entries)
class Act110Spider([Link]):
name = "NHTSA"

# Preset cookies required for session management and access

cookies = {
'_fbp': 'fb.1.1747201269861.315460754820040010',
'_gid': 'GA1.2.38545643.1747749932',
'_ga_Q4G4E8KT5J': 'GS2.1.s1747827357$o2$g1$t1747827638$j0$l0$h0',
'PHPSESSID': '454b72dc00e908524384cdb37fddb976',
'_gat_UA-21812781-2': '1',
'_ga_9VW1HCFL7C': 'GS2.1.s1747997430$o42$g1$t1747999957$j0$l0$h0',
'_ga': 'GA1.1.1470230669.1747201268',
}

headers = {
'accept':
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/
webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,hi;q=0.7',
'cache-control': 'no-cache',
'pragma': 'no-cache',
'priority': 'u=0, i',
'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136",
"Not.A/Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'none',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36
(KHTML, like Gecko) Chrome/[Link] Safari/537.36',
# 'cookie': '_fbp=fb.1.1747201269861.315460754820040010;
_gid=GA1.2.38545643.1747749932;
_ga_Q4G4E8KT5J=GS2.1.s1747827357$o2$g1$t1747827638$j0$l0$h0;
PHPSESSID=454b72dc00e908524384cdb37fddb976; _gat_UA-21812781-2=1;
_ga_9VW1HCFL7C=GS2.1.s1747997430$o42$g1$t1747999957$j0$l0$h0;
_ga=GA1.1.1470230669.1747201268',
}
custom_settings = {
"DOWNLOAD_HANDLERS": {
"http": "scrapy_impersonate.ImpersonateDownloadHandler",
"https": "scrapy_impersonate.ImpersonateDownloadHandler",
},
"TWISTED_REACTOR":
"[Link]",
}

def __init__(self):
# Store all parsed data in a list for final export
[Link] = []
self.raw_data=[]
self.data_cleaned = []
self.data_uncleaned = []
[Link] = [Link]([Link](), "exports") # or specify your own
path

# Create the directory if it doesn't exist

[Link]([Link], exist_ok=True)
[Link] = "[Link]
company_op=starts&company=&offense_group=&agency_code=NHTSA"
def start_requests(self):
browsers = [
"chrome110",
"edge99",
"safari15_5"
]
meta = {}
meta['impersonate'] = [Link](browsers)
meta['source_url'] = [Link]

# Send initial request with custom headers and cookies

yield [Link](url=[Link],meta=meta, headers=[Link],
cookies=[Link], callback=self.parse_listing)

def parse_listing(self, response):

company_links =
[Link]('//table[2]/tbody/tr/td[1]/a/@href').getall()
source_url = [Link]('source_url')

yield [Link](link,meta=meta, headers=[Link],

cookies=[Link], callback=self.parse_details)
# break # Use during testing to limit pagination

# Handle pagination by identifying total pages and iterating through all

next_page = [Link]('//a[contains(text(),">>")]/@href').get()
if next_page:
total_page = int(next_page.split('page=')[-1])
for page in range(0, total_page+1):
paginated_url = f"{[Link]}&page={page}"
browsers = [
"chrome110",
"edge99",
"safari15_5"
]
meta = {}
meta['impersonate'] = [Link](browsers)
meta['source_url']=paginated_url

yield [Link](url=paginated_url,meta=meta,
headers=[Link], cookies=[Link], callback=self.parse_listing)

def parse_details(self, response):

# Known countries list
self.known_countries = ['india ', 'usa', 'united states','us', 'uk',
'canada', 'australia', 'bahamas', 'bahamas',
'singapore', 'germany', 'france', 'hong kong',
'north america', 'america']
source_url = [Link]('source_url')

# Extract all key-value details from the company profile page

item = dict()
raw_item = dict() # For storing completely raw data

item["PDP URL"] = [Link]

raw_item["PDP URL"] = [Link]

for path in [Link]('//div[@id="contentResult"]//b'):

item['source_url']=source_url

data_skip = False
key = [Link]('./text()').get(default='NA').strip()
raw_value = \
''.join([Link]('./following-sibling::text() | ./following-
sibling::a//text()').getall()).split(
'\n:')[0].replace(':\xa0', '').strip()
value = raw_value # Initialize cleaned value with raw value

# Store raw value first

raw_item[key] = raw_value

# Format date fields into YYYY-MM-DD format (only for cleaned data)
if 'Date' in key:
try:
date_obj = [Link](value, "%B %d, %Y")
value = date_obj.strftime("%Y-%m-%d")
except Exception:
pass
# Remove "Note:" prefix and clean whitespace if key is "Note"
if key == 'Notes':
value = [Link](r'^Note:\s*', '', value).strip()

# Special handling for Company field

if key == 'Company':
# Define company suffixes for splitting checks
split_keywords = [
'inc', 'inc ', 'incorporated', 'corp', 'corp ',
'corporation','c.v.',
'llc', 'ltd', 'limited', 'lp', 'llp', 'plc', 'communities',
'center',
'co', 'company', 'group' # <-- FIXED: added comma after
'center'
]

dba_pattern = r'(.*?)\s+(trading as|dba|d/b/a|doing\s+business\

s+as)\s+(.+?)(?:\s*,\s*and\s+([^,]+))?$'
dba_match = [Link](dba_pattern, value, flags=[Link])

aliases = []
if dba_match:
# Extract all parts
main_name = dba_match.group(1).strip()
alias_part = dba_match.group(3).strip() # Part after d/b/a
and_part = dba_match.group(4).strip() if dba_match.group(4)
else None

# Clean the aliases

alias_part = [Link](r'[.,;]+$', '', alias_part)
alias_part = [Link](r'$[^)]*$$', '', alias_part).strip()
if and_part:
and_part = [Link](r'[.,;]+$', '', and_part)
and_part = [Link](r'$[^)]*$$', '', and_part).strip()

# Check if alias_part contains company suffixes

has_suffix = any(
[Link](rf'\b{[Link](kw)}\b\.?$', alias_part,
flags=[Link])
for kw in split_keywords
)

# NEW LOGIC: If suffix found in alias_part, split into separate

aliases
if has_suffix:
[Link](alias_part.lower())
if and_part:
[Link](and_part.lower())
else:
# If no suffix, combine into single alias
combined = alias_part
if and_part:
combined += f" {and_part}"
[Link]([Link]())

value = main_name # Use just the main name part

# REST OF YOUR EXISTING CODE (UNCHANGED)

=================================
# Process parentheses content

parens = [Link](r'$([^()]+)$', value)

country = None

for val in parens:

cleaned_val = [Link]().lower()
cleaned_val = [Link](r'^(now known as|formerly known as|also
known as|trading as)\s+', '',
cleaned_val, flags=[Link])

if cleaned_val.lower() in self.known_countries:
value = [Link](r'$' + [Link](val) + r'$', f' ({val})',
value)
country = cleaned_val
else:
if f'({val})' in value:
[Link](cleaned_val)

for val in parens:

if [Link]() not in self.known_countries:
value = [Link](r'$' + [Link](val) + r'$', ' ', value)

# Extract aliases from other known keywords (fka, aka, etc.)

alias_keywords_regex = r'\b(?:f/k/a|fka|a/k/a|aka|trading as)\b'
parts = [Link](alias_keywords_regex, value, flags=[Link])
parts = [[Link]().strip(' ,;') for p in parts if [Link]()]

if parts:
company_name = parts[0]
extracted_aliases = parts[1:]
cleaned_aliases = [
[Link](r'^[/\s]+', '', [Link]().strip())
for a in extracted_aliases
if len([Link]()) >= 3 and [Link]().strip() not in ["hk"]
]
[Link](cleaned_aliases)
else:
company_name = [Link]()

company_name = self.clean_text_punct(company_name)

# 🔽 NEW: Extract additional info

# company_name, additional_info =
self.extract_additional_info(company_name)
item[key] = company_name
# if additional_info:
# item['Additional Info'] = additional_info

# Assign aliases
for idx, alias_val in enumerate(aliases[:2], start=1):
alias_val = [Link](r'^[/\s]+', '', alias_val)
remove_words = ['fka', 'f/k/a', 'aka', 'a/k/a']
pattern = r'^(?:' + '|'.join([Link](word) for word in
remove_words) + r')\s+'
alias_val = [Link](pattern, '', alias_val, flags=[Link])
split_aliases = [Link](r'\s+and\s+', alias_val,
flags=[Link])

for i, part in enumerate(split_aliases, start=idx):

part = [Link]()
if part:
part = [Link](r'^and\s*', '', part,
flags=[Link])
item[f'Alias {i}'] = part

idx += len(split_aliases) - 1

continue

# Handle Current Parent Company (similar to Company but without

alias/country extraction)
if key == 'Current Parent Company':
# Clean up punctuation (keeping - ' & /)
text_no_punct = [Link]([Link]('', '',
[Link]('-', '')
.replace("'", '')
.replace('&', '')
.replace('/', '')))
value = [Link](r'\s+', ' ', text_no_punct).strip()
item[key + " name"] = value
item[key + " url"] =
[Link]('./following-sibling::a/@href').get(default='')
raw_item[key + " name"] = raw_value
raw_item[key + " url"] =
[Link]('./following-sibling::a/@href').get(default='')
continue

# Handle special keys (Mega-Scandal, Source of Data)

if 'Mega-Scandal' in key or 'Source of Data' in key:
if 'Mega-Scandal' in key:
item[key + " name"] = value
item[key + " url"] =
[Link]('./following-sibling::a/@href').get(default='')
raw_item[key + " name"] = raw_value
raw_item[key + " url"] =
[Link]('./following-sibling::a/@href').get(default='')
else:
value =
[Link]('./following-sibling::a/@href').get(default='')
raw_item[key] = value
item[key] = value
data_skip = True

# Handle link-based values (e.g., "click here")

if not data_skip:
if "(click here)" in [Link]():
value =
[Link]('./following-sibling::a/@href').get(default='')
raw_item[key] = value
item[key] = value # All other fields keep original punctuation

# Additional handling for specific URL columns

url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-
Scandal url']
for url_col in url_columns:
if url_col in item and isinstance(item[url_col], str) and "(click
here)" in item[url_col].lower():
url_value = [Link](
f'//b[contains(text(), "{url_col.replace(" url",
"")}")]/following-sibling::a/@href').get(
default='')
item[url_col] = url_value
raw_item[url_col] = url_value

# Reorder the fields before splitting/appending

reordered_item = dict()
alias_keys = sorted([k for k in [Link]() if [Link]("Alias ")],
key=lambda x: int([Link]()[1]))
other_keys = [k for k in [Link]() if not [Link]("Alias ") and k !=
'Additional Info']

for key in other_keys:

reordered_item[key] = item[key]
for key in alias_keys:
reordered_item[key] = item[key]
if 'Additional Info' in item:
reordered_item['Additional Info'] = item['Additional Info']

split_items = self.split_company_names(reordered_item)
for new_item in split_items:
[Link](new_item)

self.raw_data.append(raw_item)

def split_company_names(self, item):

split_keywords = [
'inc', 'incorporated', 'corp', 'corporation', 'c.v.',
'llc', 'ltd', 'limited', 'lp', 'llp', 'plc', 'co', 'company'
]
company_name = item['Company']
# Split on both "and" and ";" (keeping separators)
if 'a division of volvo' in company_name.lower():
print()
parts = [Link](r'\s*(and|;)\s*', company_name, flags=[Link])

if len(parts) == 1: # No splitting occurred

return [item]

result_items = []
current = parts[0].strip()
i = 1

while i < len(parts):

sep = parts[i] # "and" or ";"
next_part = parts[i + 1].strip()

# Check if current or next_part ends with a keyword

current_has_keyword = any(
[Link](rf'\b{kw}\.?$', current, [Link])
for kw in split_keywords
)
next_has_keyword = any(
[Link](rf'\b{kw}\.?$', next_part, [Link])
for kw in split_keywords
)

if sep == ";" or current_has_keyword or next_has_keyword:

# If separator is ";", always split (regardless of keywords)
new_item = [Link]()
new_item['Company'] = current
result_items.append(new_item)
current = next_part
else:
# If separator is "and" and no keywords, merge
current += f" {sep} {next_part}"

i += 2

# Add the last remaining part

if current:
new_item = [Link]()
new_item['Company'] = current
result_items.append(new_item)

return result_items

def extract_name_alias(self, entry):

"""
Extract country from a given company name string.k
Returns a tuple: (name, country)
"""
try:
if not isinstance(entry, str):
return entry, None

entry = [Link]()
country = None
# Extract and remove country from parentheses
parens = [Link](r'$([^()]+)$', entry)
for val in parens:
if [Link]().lower() in self.known_countries:
country = [Link]()
entry = [Link](r'$' + [Link](val) + r'$', '',
entry).strip()

return entry, country

except Exception:
return entry, None

def clean_text_punct(self, text):

"""Clean punctuation and whitespace for Company-related fields"""
if not isinstance(text, str) or [Link]() == '':
return text
keep_chars = {"&", "-", "/", "'", ".",";"}
cleaned = []

# Process the text and preserve country names inside parentheses

if "HK" not in text:
text = [Link]("(", "abcdef")
text = [Link](")", "fedcba")

for char in text:

if char in [Link] and char not in keep_chars:
[Link](' ')
else:
[Link](char)

cleaned = ''.join(cleaned)
cleaned = [Link]("abcdef", "(")
cleaned = [Link]("fedcba", ")")

return [Link](r'\s+', ' ', cleaned).strip()

def normalize_whitespace(self,text):
"""Normalize whitespace (used for non-Company fields)"""
if not isinstance(text, str) or [Link]() == '':
return text
return [Link](r'\s+', ' ', text).strip()
#
# def extract_additional_info(self, entry):
# """
# Extracts additional information like 'Division of', 'A unit of', 'A
business of', etc.
# Returns tuple: (cleaned_entry, additional_info)
# """
# if not isinstance(entry, str):
# return entry, None
#
# entry = [Link]()
# additional_info = None
#
# # List of patterns to search for
# patterns = [
# r'\b(a Division of\s+.+)',
# r'\b(A\s+unit of\s+.+)',
# r'\b(A\s+business of\s+.+)',
# r'\b(A\s+subsidiary of\s+.+)',
# r'\b(Segment of\s+.+)',
# r'\b(Sub-division of\s+.+)',
# ]
#
# for pattern in patterns:
# match = [Link](pattern, entry, flags=[Link])
# if match:
# additional_info = [Link](0).strip()
# entry = [Link]([Link](additional_info), '', entry,
flags=[Link]).strip()
# break # Only extract the first matching additional info
#
# # Clean extra spaces and dangling "by", "of", "a"
# entry = [Link](r'\b(of|by|a)\s*$', '', entry,
flags=[Link]).strip()
#
# return entry, additional_info

def close(self, reason):

# Create DataFrames
df_cleaned = [Link]([Link])
df_uncleaned = [Link](self.raw_data) # Use completely raw data

# Add common columns to both DataFrames

df_cleaned.insert(0, 'ID', range(1, 1 + len(df_cleaned)))

# Ensure source_url is second column

if 'source_url' in df_cleaned.columns:
cols = df_cleaned.[Link]()
[Link](1, [Link]([Link]('source_url')))
df_cleaned = df_cleaned[cols]

# Only add ID to uncleaned data (without Source URL)

df_uncleaned.insert(0, 'ID', range(1, 1 + len(df_uncleaned)))
# df_uncleaned = df_uncleaned.drop(columns=['Current Parent Company
url','Source of Data'])

# Save uncleaned file (completely raw data)

timestamp = [Link]().strftime('%Y-%m-%d_%H%M%S')
filename_uncleaned =
f"raw_data_violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
filepath_uncleaned = [Link]([Link], filename_uncleaned)

with [Link](filepath_uncleaned, engine='xlsxwriter',

engine_kwargs={'options': {'strings_to_numbers':
True}}) as writer:
df_uncleaned.fillna("").to_excel(writer, index=False)

# Process cleaned data

protected_columns = ['ID', 'Source URL', 'PDP URL', 'Penalty', 'Date',
'Current Parent Company url', 'Archived Source',
'Mega-Scandal url']
columns_to_clean = ['Company', 'Current Parent Company name']

# 1. Replace N/A with blank except in protected columns

for col in df_cleaned.columns:
if col not in protected_columns:
df_cleaned[col] = df_cleaned[col].replace(['N/A', 'NA'], '')
columns_to_clean = ['Company', 'Current Parent Company']

# Apply per-column logic

for col in df_cleaned.columns:
if col in protected_columns:
continue
if df_cleaned[col].dtype == object:
if col in columns_to_clean:
df_cleaned[col] = df_cleaned[col].apply(self.clean_text_punct)
else:
df_cleaned[col] =
df_cleaned[col].apply(self.normalize_whitespace)
# Ensure all 'Alias' columns are sorted and moved to the end
alias_cols = sorted([col for col in df_cleaned.columns if
[Link]('Alias ')],
key=lambda x: int([Link]()[1]) if [Link]()
[1].isdigit() else float('inf'))

# Get all non-alias columns

non_alias_cols = [col for col in df_cleaned.columns if col not in
alias_cols]

# Reorder columns: non-alias first, then alias

df_cleaned = df_cleaned[non_alias_cols + alias_cols]

# Save cleaned file

filename_cleaned =
f"violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
filepath_cleaned = [Link]([Link], filename_cleaned)

with [Link](filepath_cleaned, engine='xlsxwriter',

engine_kwargs={'options': {'strings_to_numbers':
True}}) as writer:
df_cleaned.to_excel(writer, index=False)
# Columns that should have punctuation removed

if __name__ == '__main__':
execute("scrapy crawl NHTSA".split())

Scrapy Spider for NHTSA Violation Data
No ratings yet
Scrapy Spider for NHTSA Violation Data
22 pages
Scraping DOJ Violation Data with Scrapy
No ratings yet
Scraping DOJ Violation Data with Scrapy
12 pages
Scrapy Spider for EEOC Violation Data
No ratings yet
Scrapy Spider for EEOC Violation Data
10 pages
Scrapy Spider for DOJ Civil Violations
No ratings yet
Scrapy Spider for DOJ Civil Violations
8 pages
OFAC Violation Data Scraper Script
No ratings yet
OFAC Violation Data Scraper Script
7 pages
Scrapy Spider for CMS Violation Data
No ratings yet
Scrapy Spider for CMS Violation Data
10 pages
Scrapy Spider for CMS Violation Data
No ratings yet
Scrapy Spider for CMS Violation Data
10 pages
Company Listing Management System
No ratings yet
Company Listing Management System
3 pages
Company Data Extraction Tool
No ratings yet
Company Data Extraction Tool
7 pages
Subdomain Enumeration Tool in Python
No ratings yet
Subdomain Enumeration Tool in Python
11 pages
SEO Review Tools API Integration
No ratings yet
SEO Review Tools API Integration
15 pages
GraphQL Schema Dumping Script
No ratings yet
GraphQL Schema Dumping Script
5 pages
MCX India Spider Data Extraction Script
No ratings yet
MCX India Spider Data Extraction Script
24 pages
Pandas DataFrame Operations Guide
No ratings yet
Pandas DataFrame Operations Guide
5 pages
JSON to Excel Data Extraction Script
No ratings yet
JSON to Excel Data Extraction Script
3 pages
Excel Data Extraction Process
No ratings yet
Excel Data Extraction Process
4 pages
Data Processing and Analysis Techniques
No ratings yet
Data Processing and Analysis Techniques
18 pages
Scraping Store and Barcode Data
No ratings yet
Scraping Store and Barcode Data
9 pages
Google Scholar Citation Search Tool
No ratings yet
Google Scholar Citation Search Tool
6 pages
Web Scraping and Data Analysis Tool
No ratings yet
Web Scraping and Data Analysis Tool
10 pages
Python Data Engineering Essentials
No ratings yet
Python Data Engineering Essentials
25 pages
The 1 Python Script That Automates 90 of Your Data Cleaning
No ratings yet
The 1 Python Script That Automates 90 of Your Data Cleaning
9 pages
Data Extraction and Analysis Techniques
No ratings yet
Data Extraction and Analysis Techniques
50 pages
Dirty Data Cleaning Guide
No ratings yet
Dirty Data Cleaning Guide
8 pages
Data Cleaning for Investment Records
No ratings yet
Data Cleaning for Investment Records
2 pages
PSIII Exp14
No ratings yet
PSIII Exp14
4 pages
Python and SQL Practical Exercises Guide
No ratings yet
Python and SQL Practical Exercises Guide
35 pages
Python Car Scraper with PHP Interface
No ratings yet
Python Car Scraper with PHP Interface
14 pages
Cloud Run LLM Integration Guide
No ratings yet
Cloud Run LLM Integration Guide
12 pages
Pseudocode for Article Scraping
No ratings yet
Pseudocode for Article Scraping
14 pages
Scrapy: Key Concepts and Workflow Guide
No ratings yet
Scrapy: Key Concepts and Workflow Guide
3 pages
Saudi Real Estate Scraper Tool
No ratings yet
Saudi Real Estate Scraper Tool
23 pages
Machine Learning Lab Report IIIT Surat
No ratings yet
Machine Learning Lab Report IIIT Surat
11 pages
Day-22-Python-Writing Cleaned Data To CSV
No ratings yet
Day-22-Python-Writing Cleaned Data To CSV
5 pages
API Documentation for POST Method
No ratings yet
API Documentation for POST Method
2 pages
Flask API for Product Recommendations
No ratings yet
Flask API for Product Recommendations
26 pages
MCX India Spider Data Extraction Script
No ratings yet
MCX India Spider Data Extraction Script
13 pages
Django CRM Advanced Guide
No ratings yet
Django CRM Advanced Guide
72 pages
Advanced Python Practical Full
No ratings yet
Advanced Python Practical Full
3 pages
Retrieve Customer Data by ID
No ratings yet
Retrieve Customer Data by ID
87 pages
MCX India Membership Data Scraper
No ratings yet
MCX India Membership Data Scraper
17 pages
Python Data Analysis Guide for Beginners
No ratings yet
Python Data Analysis Guide for Beginners
14 pages
Facebook Cookie Management Script
No ratings yet
Facebook Cookie Management Script
44 pages
Data Wrangling with Python and Pandas
No ratings yet
Data Wrangling with Python and Pandas
87 pages
VK User Parser with Proxy Support
No ratings yet
VK User Parser with Proxy Support
6 pages
Serverless URL Shortener - AWS Console Step-By-Step Guide
No ratings yet
Serverless URL Shortener - AWS Console Step-By-Step Guide
32 pages
It FDSP 6-7
No ratings yet
It FDSP 6-7
22 pages
Personal Loan Eligibility Checker API
No ratings yet
Personal Loan Eligibility Checker API
5 pages
Stock Market Trading System Project
No ratings yet
Stock Market Trading System Project
55 pages
Python Assignment 1
No ratings yet
Python Assignment 1
5 pages
Pandas DataFrame and SQL Operations Guide
No ratings yet
Pandas DataFrame and SQL Operations Guide
13 pages
CS Practical Record for SHVShop
No ratings yet
CS Practical Record for SHVShop
53 pages
Excel and Data Preprocessing Techniques
No ratings yet
Excel and Data Preprocessing Techniques
114 pages
Unique String Analysis and Functions
No ratings yet
Unique String Analysis and Functions
7 pages
Essential Pandas Functions Cheat Sheet
No ratings yet
Essential Pandas Functions Cheat Sheet
9 pages
Scraping With The Shell Command: Duplicate Links Are Filtered, Which Is Handy. However, The Spider Is Wasting Resources
No ratings yet
Scraping With The Shell Command: Duplicate Links Are Filtered, Which Is Handy. However, The Spider Is Wasting Resources
15 pages
Web Crawler with Porter Stemmer and BeautifulSoup
No ratings yet
Web Crawler with Porter Stemmer and BeautifulSoup
10 pages
57 Changes Require
No ratings yet
57 Changes Require
1 page
MCX India Membership Data Scraper
No ratings yet
MCX India Membership Data Scraper
13 pages
Gov Process
No ratings yet
Gov Process
1 page
Vendor Compliance and Watchlist Data
No ratings yet
Vendor Compliance and Watchlist Data
23 pages
Business Name Variations and Aliases
No ratings yet
Business Name Variations and Aliases
1 page
77 Changes Need
No ratings yet
77 Changes Need
1 page
57 Changes Require
No ratings yet
57 Changes Require
1 page
MCX India Membership Data Scraper
No ratings yet
MCX India Membership Data Scraper
13 pages
Bosch Releaseletter VideoClient 1.7.6 SR2
No ratings yet
Bosch Releaseletter VideoClient 1.7.6 SR2
25 pages
Microcontrollers vs. Microprocessors Explained
100% (1)
Microcontrollers vs. Microprocessors Explained
11 pages
A Cor da Ternura PDF Download
83% (12)
A Cor da Ternura PDF Download
50 pages
OptiView Chiller Control Panel Software Update
100% (1)
OptiView Chiller Control Panel Software Update
3 pages
SPSS 18.0 Training Workshop Guide
No ratings yet
SPSS 18.0 Training Workshop Guide
32 pages
Database Concepts and LibreOffice Base Quiz
No ratings yet
Database Concepts and LibreOffice Base Quiz
3 pages
Worldwide Quarterly Mobile Phone Tracker: Technology Coverage
No ratings yet
Worldwide Quarterly Mobile Phone Tracker: Technology Coverage
2 pages
Class 10 IT Sample Question Paper
No ratings yet
Class 10 IT Sample Question Paper
7 pages
Impulse Response Analysis in MATLAB
No ratings yet
Impulse Response Analysis in MATLAB
4 pages
Comparing Cloud Services: AWS, Azure, GCP
No ratings yet
Comparing Cloud Services: AWS, Azure, GCP
2 pages
Strategic Writing for UX: A Review
No ratings yet
Strategic Writing for UX: A Review
2 pages
Using The SDRAM On Altera's DE0 Board With Verilog Designs: For Quartus II 13.1
No ratings yet
Using The SDRAM On Altera's DE0 Board With Verilog Designs: For Quartus II 13.1
14 pages
Integrating NAMASTE with ICD-11 EHRs
No ratings yet
Integrating NAMASTE with ICD-11 EHRs
6 pages
Competitive Coding Experiment 5
No ratings yet
Competitive Coding Experiment 5
9 pages
PALCE16V8 & PALCE16V8Z Logic Devices
No ratings yet
PALCE16V8 & PALCE16V8Z Logic Devices
32 pages
Multijet Plastic Printers
No ratings yet
Multijet Plastic Printers
4 pages
Pandas Series and DataFrame Guide
No ratings yet
Pandas Series and DataFrame Guide
20 pages
EzyShop: A MERN E-Commerce Solution
No ratings yet
EzyShop: A MERN E-Commerce Solution
37 pages
Duchemin's Lightroom Preset Guide
No ratings yet
Duchemin's Lightroom Preset Guide
18 pages
Overview of Mainframe Operating Systems
No ratings yet
Overview of Mainframe Operating Systems
4 pages
BCA Curriculum Framework at Bankura University
No ratings yet
BCA Curriculum Framework at Bankura University
131 pages
Original PDF Research Methods in Health Foundations For Evidenced Based Practice 2nd Edition Premium Unlock
100% (10)
Original PDF Research Methods in Health Foundations For Evidenced Based Practice 2nd Edition Premium Unlock
213 pages
Test Plan for Project Name Phase
No ratings yet
Test Plan for Project Name Phase
8 pages
ESMIG U2A Configurations Guide
No ratings yet
ESMIG U2A Configurations Guide
31 pages
Query Processing and Optimization Steps
50% (2)
Query Processing and Optimization Steps
12 pages
Uncycloedia
No ratings yet
Uncycloedia
16 pages
C++ Handwritten Notes PDF Download
No ratings yet
C++ Handwritten Notes PDF Download
88 pages
SpaceX Falcon 9 Landing Prediction
No ratings yet
SpaceX Falcon 9 Landing Prediction
46 pages
Senior Systems Engineer Profile
No ratings yet
Senior Systems Engineer Profile
4 pages
Essential Guide to Image Editing Techniques
No ratings yet
Essential Guide to Image Editing Techniques
7 pages

Scrapy Spider for NHTSA Violation Data

Uploaded by

Scrapy Spider for NHTSA Violation Data

Uploaded by

# import random

# Preset cookies required for session management and access

# Create the directory if it doesn't exist

# Send initial request with custom headers and cookies

def parse_listing(self, response):

for link in company_links:

yield [Link](link,meta=meta, headers=[Link],

# Handle pagination by identifying total pages and iterating through all

def parse_details(self, response):

# Extract all key-value details from the company profile page

item["PDP URL"] = [Link]

for path in [Link]('//div[@id="contentResult"]//b'):

# Store raw value first

# Special handling for Company field

dba_pattern = r'(.*?)\s+(trading as|dba|d/b/a|doing\s+business\

# Clean the aliases

# Check if alias_part contains company suffixes

# NEW LOGIC: If suffix found in alias_part, split into separate

value = main_name # Use just the main name part

# REST OF YOUR EXISTING CODE (UNCHANGED)

parens = [Link](r'\(([^()]+)\)', value)

for val in parens:

for val in parens:

# Extract aliases from other known keywords (fka, aka, etc.)

# 🔽 NEW: Extract additional info

for i, part in enumerate(split_aliases, start=idx):

# Handle Current Parent Company (similar to Company but without

# Handle special keys (Mega-Scandal, Source of Data)

# Handle link-based values (e.g., "click here")

# Additional handling for specific URL columns

# Reorder the fields before splitting/appending

for key in other_keys:

def split_company_names(self, item):

if len(parts) == 1: # No splitting occurred

while i < len(parts):

# Check if current or next_part ends with a keyword

if sep == ";" or current_has_keyword or next_has_keyword:

# Add the last remaining part

def extract_name_alias(self, entry):

return entry, country

def clean_text_punct(self, text):

# Process the text and preserve country names inside parentheses

for char in text:

return [Link](r'\s+', ' ', cleaned).strip()

def close(self, reason):

# Add common columns to both DataFrames

# Ensure source_url is second column

# Only add ID to uncleaned data (without Source URL)

# Save uncleaned file (completely raw data)

with [Link](filepath_uncleaned, engine='xlsxwriter',

# Process cleaned data

# 1. Replace N/A with blank except in protected columns

# Apply per-column logic

# Get all non-alias columns

# Reorder columns: non-alias first, then alias

# Save cleaned file

with [Link](filepath_cleaned, engine='xlsxwriter',

You might also like