diff --git a/eastmoney/eastmoney.py b/eastmoney/eastmoney.py
index 4da2cc9..a305802 100644
--- a/eastmoney/eastmoney.py
+++ b/eastmoney/eastmoney.py
@@ -14,7 +14,7 @@
 
 
 def run_detail2(code,name,url):
-    soup=getstart.geturl_utf8(url)
+    soup=geturl.geturl_utf8(url)
     tags=soup.find_all(class_='ui-font-middle ui-color-red ui-num')
     m1=tags[3].string
     y1=tags[4].string
@@ -30,7 +30,7 @@ def run_detail2(code,name,url):
 
 
 def run_detail1(code,name,url):
-    soup=getstart.geturl_utf8(url)
+    soup=geturl.geturl_utf8(url)
     tags=soup.select('dd')
     try:
         m1=(tags[1].find_all('span')[1].string)
@@ -46,7 +46,7 @@ def run_detail1(code,name,url):
         run_detail2(code,name,url)
 
 
-soup=getstart.geturl_gbk(url)
+soup=geturl.geturl_gbk(url)
 tags=soup.select('.num_right > li')
 for tag in tags:
     if tag.a is None:
@@ -54,13 +54,13 @@ def run_detail1(code,name,url):
     else:
         content=tag.a.text
         code=re.findall(r'\d+',content)[0]
-        #print(code)
+        print(code)
         name=content.split(')')[1]
-        #print(name)
+        print(name)
         url=tag.a['href']
-        #print(content)
+        print(content)
         content_dict={'code':code,'name':name,'url':url}
-        #print (content_dict)
+        print (content_dict)
         col1.insert(content_dict)
         time.sleep(0.1)
         run_detail1(code,name,url)
diff --git a/eastmoney/fund_spider.py b/eastmoney/fund_spider.py
new file mode 100644
index 0000000..0a67bac
--- /dev/null
+++ b/eastmoney/fund_spider.py
@@ -0,0 +1,138 @@
+import requests
+import pandas as pd
+import re
+import json
+import time
+import random
+import math
+from bs4 import BeautifulSoup
+
+
+def get_fundcode():
+    '''
+    Fetch the fund code list from fundcode_search.js.
+    :return: DataFrame with the fundcode, name and category columns of rows 0 to 10
+    '''
+    url = 'http://fund.eastmoney.com/js/fundcode_search.js'
+    r = requests.get(url)
+    cont = re.findall('var r = (.*])', r.text)[0]  # extract the list literal
+    ls = json.loads(cont)  # parse the string-encoded list into a Python list
+    fundcode = pd.DataFrame(ls, columns=['fundcode', 'fundsx', 'name', 'category', 'fundpy'])  # list to DataFrame
+    fundcode = fundcode.loc[0:10, ['fundcode', 'name', 'category']]
+    #fundcode.to_csv('./fundcode.csv', index=False, encoding = 'gbk')
+    return fundcode
+
+
+def get_fundjbgk():
+    '''
+    Scrape the profile table of every fund listed by get_fundcode() and save it to ./fund_info.csv.
+    '''
+    fund_jbgk = []
+    fund_list = get_fundcode()
+    for i in fund_list['fundcode']:
+        jbgk_addr = f'http://fundf10.eastmoney.com/jbgk_{i}.html'
+        g = requests.get(jbgk_addr)
+        g.encoding = g.apparent_encoding
+        s = BeautifulSoup(g.text, 'html.parser')
+        table = s.find('table', {'class': 'info w790'})
+        # a page without the table yields no rows instead of raising AttributeError
+        rows = table.findAll('tr') if table is not None else []
+        temp_jbgk = []
+        for row in rows:
+            for col in row.findAll('td'):
+                temp_jbgk.append(col.get_text())
+        if len(temp_jbgk) >= 16:
+            fund_jbgk.append(temp_jbgk[0:16])
+        else:
+            print(temp_jbgk)
+        time.sleep(random.randint(1, 3))
+    df_jbgk = pd.DataFrame(fund_jbgk, columns=['基金全称', '基金简称', '基金代码', '基金类型', '发行日期', '成立日期/规模', '资产规模', '份额规模', '基金管理人', '基金托管人', '基金经理人', '成立来分红', '管理费率', '托管费率', '销售服务费率', '最高认购费率'])
+    df_jbgk.to_csv('./fund_info.csv', index=False, encoding = 'gbk')
+
+
+def get_one_page(fundcode, pageIndex=1):
+    '''
+    Fetch one page of a fund's net-value history.
+    :param fundcode: str, fund code
+    :param pageIndex: int, page number
+    :return: str response text, or None when the request fails or the status is not 200
+    '''
+    url = 'http://api.fund.eastmoney.com/f10/lsjz'
+    cookie = 'EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; EMFUND5=null; EMFUND6=null; EMFUND7=null; EMFUND8=null; EMFUND0=null; EMFUND9=01-24 17:11:50@#$%u957F%u4FE1%u5229%u5E7F%u6DF7%u5408A@%23%24519961; st_pvi=27838598767214; st_si=11887649835514'
+    headers = {
+        'Cookie': cookie,
+        'Host': 'api.fund.eastmoney.com',
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
+        'Referer': 'http://fundf10.eastmoney.com/jjjz_%s.html' % fundcode,
+    }
+    params = {
+        'callback': 'jQuery18307633215694564663_1548321266367',
+        'fundCode': fundcode,
+        'pageIndex': pageIndex,
+        'pageSize': 20,
+    }
+    try:
+        r = requests.get(url=url, headers=headers, params=params)
+        if r.status_code == 200:
+            return r.text
+        return None
+    except requests.RequestException:  # base class of the errors requests raises
+        return None
+
+
+def parse_one_page(html):
+    '''
+    Parse the JSONP text returned by get_one_page.
+    :param html: str or None, response text
+    :return: dict with 'lsjz' (DataFrame) and 'total_page' (int), or None when there is nothing to parse
+    '''
+    if html is None:
+        return None
+    # greedy + DOTALL: capture everything between the first '(' and the last ')'
+    match = re.search(r'\((.*)\)', html, re.S)
+    if match is None:  # no callback(...) wrapper in the text
+        return None
+    data = json.loads(match.group(1))
+    lsjz_list = data['Data']['LSJZList']  # net-value history records
+    total_count = data['TotalCount']  # total number of records
+    total_page = math.ceil(total_count / 20)  # get_one_page requests pageSize=20
+    lsjz = pd.DataFrame(lsjz_list)
+    info = {'lsjz': lsjz,
+            'total_page': total_page}
+    return info
+
+
+def main(fundcode):
+    '''
+    Save the crawled net-value history of one fund to ./<fundcode>_lsjz.csv.
+    '''
+    html = get_one_page(fundcode)
+    info = parse_one_page(html)
+    if info is None:  # first page unavailable: nothing to save
+        return
+    total_page = info['total_page']
+    lsjz = info['lsjz']
+    lsjz.to_csv('./%s_lsjz.csv' % fundcode, index=False, encoding = 'gbk')  # first page, written with the header row
+    page = 1
+    while page < total_page:
+        page += 1
+        print(lsjz)
+        html = get_one_page(fundcode, pageIndex=page)
+        info = parse_one_page(html)
+        if info is None:
+            break
+        lsjz = info['lsjz']
+        lsjz.to_csv('./%s_lsjz.csv' % fundcode, mode='a', index=False, header=False, encoding = 'gbk')  # append to the same file
+        time.sleep(random.randint(3, 5))
+
+
+if __name__=='__main__':
+    # scrape the fund profiles (the codes come from get_fundcode)
+    get_fundjbgk()
+    # # fundcode = '519961'
+    # fundcodes = pd.read_csv('./fundcode.csv', converters={'fundcode': str})
+    # # fetch the net-value history of every fund
+    # for fundcode in fundcodes['fundcode']:
+    #     print(fundcode)
+    #     main(fundcode)
+    #     time.sleep(random.randint(5, 10))
\ No newline at end of file
diff --git a/eastmoney/geturl.py b/eastmoney/geturl.py
index 3213b4b..a8b02a8 100644
--- a/eastmoney/geturl.py
+++ b/eastmoney/geturl.py
@@ -5,10 +5,13 @@ UA_LIST = [
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
 ]
 header={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 'Connection': 'keep-alive','User-Agent': random.choice(UA_LIST) }
-proxies=['http://118.178.124.33:3128',
-'http://139.129.166.68:3128',
-'http://61.163.39.70:9999',
-'http://61.143.228.162']
+# proxies=['http://118.178.124.33:3128',
+# 'http://139.129.166.68:3128',
+# 'http://61.163.39.70:9999',
+# 'http://61.143.228.162']
+
+proxies=['http://61.135.217.7:80']
+
 def geturl_gbk(url):
     html=requests.get(url,headers=header,proxies={'http':random.choice(proxies)}).content.decode('gbk')
     soup=BeautifulSoup(html,'lxml')