From 83e3429b45a013dbdb429d9f180f9b1968aba8cc Mon Sep 17 00:00:00 2001 From: caocheng Date: Sun, 7 Jun 2020 13:44:30 +0800 Subject: [PATCH 1/4] update code --- eastmoney/eastmoney.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eastmoney/eastmoney.py b/eastmoney/eastmoney.py index 4da2cc9..9d5dddc 100644 --- a/eastmoney/eastmoney.py +++ b/eastmoney/eastmoney.py @@ -14,7 +14,7 @@ def run_detail2(code,name,url): - soup=getstart.geturl_utf8(url) + soup=geturl.geturl_utf8(url) tags=soup.find_all(class_='ui-font-middle ui-color-red ui-num') m1=tags[3].string y1=tags[4].string From c2350469d1835274f316cbd3245a9030455e7aea Mon Sep 17 00:00:00 2001 From: caocheng Date: Sun, 7 Jun 2020 13:45:55 +0800 Subject: [PATCH 2/4] update code --- eastmoney/eastmoney.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eastmoney/eastmoney.py b/eastmoney/eastmoney.py index 9d5dddc..ad77c1e 100644 --- a/eastmoney/eastmoney.py +++ b/eastmoney/eastmoney.py @@ -30,7 +30,7 @@ def run_detail2(code,name,url): def run_detail1(code,name,url): - soup=getstart.geturl_utf8(url) + soup=geturl.geturl_utf8(url) tags=soup.select('dd') try: m1=(tags[1].find_all('span')[1].string) @@ -46,7 +46,7 @@ def run_detail1(code,name,url): run_detail2(code,name,url) -soup=getstart.geturl_gbk(url) +soup=geturl.geturl_gbk(url) tags=soup.select('.num_right > li') for tag in tags: if tag.a is None: From 9d4531870e511b228db71b65fb3ec008d310c1f7 Mon Sep 17 00:00:00 2001 From: caocheng Date: Sat, 27 Jun 2020 16:48:30 +0800 Subject: [PATCH 3/4] update code --- eastmoney/eastmoney.py | 8 +-- eastmoney/fund_spider.py | 123 +++++++++++++++++++++++++++++++++++++++ eastmoney/geturl.py | 11 ++-- eastmoney/test.py | 88 ++++++++++++++++++++++++++++ 4 files changed, 222 insertions(+), 8 deletions(-) create mode 100644 eastmoney/fund_spider.py create mode 100644 eastmoney/test.py diff --git a/eastmoney/eastmoney.py b/eastmoney/eastmoney.py index ad77c1e..a305802 100644 --- a/eastmoney/eastmoney.py +++ b/eastmoney/eastmoney.py @@ -54,13 +54,13 @@ def run_detail1(code,name,url): else: content=tag.a.text code=re.findall(r'\d+',content)[0] - #print(code) + print(code) name=content.split(')')[1] - #print(name) + print(name) url=tag.a['href'] - #print(content) + print(content) content_dict={'code':code,'name':name,'url':url} - #print (content_dict) + print (content_dict) col1.insert(content_dict) time.sleep(0.1) run_detail1(code,name,url) diff --git a/eastmoney/fund_spider.py b/eastmoney/fund_spider.py new file mode 100644 index 0000000..d996620 --- /dev/null +++ b/eastmoney/fund_spider.py @@ -0,0 +1,123 @@ +import requests +import pandas as pd +import re +import json +import time +import random +import math +from bs4 import BeautifulSoup + + +def get_fundcode(): + ''' + 获取fundcode列表 + :return: 将获取的DataFrame以csv格式存入本地 + ''' + url = 'http://fund.eastmoney.com/js/fundcode_search.js' + r = requests.get(url) + cont = re.findall('var r = (.*])', r.text)[0] # 提取list + ls = json.loads(cont) # 将字符串个事的list转化为list格式 + fundcode = pd.DataFrame(ls, columns=['fundcode', 'fundsx', 'name', 'category', 'fundpy']) # list转为DataFrame + fundcode = fundcode.loc[0:100, ['fundcode', 'name', 'category']] + #fundcode.to_csv('./fundcode.csv', index=False, encoding = 'gbk') + return fundcode + +def get_fundjbgk(): + fund_jbgk = [] + fund_list = get_fundcode() + for i in fund_list['fundcode']: + jbgk_addr = f'http://fundf10.eastmoney.com/jbgk_{i}.html' + g = requests.get(jbgk_addr) + g.encoding = g.apparent_encoding + s = BeautifulSoup(g.text, 'html.parser') + table = s.find('table', {'class': 'info w790'}) + temp_jbgk = [] + for row in table.findAll('tr'): + for col in row.findAll('td'): + temp_jbgk.append(col.get_text()) + fund_jbgk.append(temp_jbgk) + time.sleep(random.randint(1, 3)) + df_jbgk = pd.DataFrame(fund_jbgk, columns=['基金全称', '基金简称', '基金代码', '基金类型', '发行日期', '成立日期/规模', '资产规模', '份额规模', '基金管理人', '基金托管人', '基金经理人', '成立来分红', '管理费率', '托管费率', '销售服务费率', '最高认购费率']) + df_jbgk.to_csv('./fund_info.csv', index=False, encoding = 'gbk') + + +def get_one_page(fundcode, pageIndex=1): + ''' + 获取基金净值某一页的html + :param fundcode: str格式,基金代码 + :param pageIndex: int格式,页码数 + :return: str格式,获取网页内容 + ''' + url = 'http://api.fund.eastmoney.com/f10/lsjz' + cookie = 'EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; EMFUND5=null; EMFUND6=null; EMFUND7=null; EMFUND8=null; EMFUND0=null; EMFUND9=01-24 17:11:50@#$%u957F%u4FE1%u5229%u5E7F%u6DF7%u5408A@%23%24519961; st_pvi=27838598767214; st_si=11887649835514' + headers = { + 'Cookie': cookie, + 'Host': 'api.fund.eastmoney.com', + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', + 'Referer': 'http://fundf10.eastmoney.com/jjjz_%s.html' % fundcode, + } + params = { + 'callback': 'jQuery18307633215694564663_1548321266367', + 'fundCode': fundcode, + 'pageIndex': pageIndex, + 'pageSize': 20, + } + try: + r = requests.get(url=url, headers=headers, params=params) + if r.status_code == 200: + return r.text + return None + except RequestException: + return None + + +def parse_one_page(html): + ''' + 解析网页内容 + :param html: str格式,html内容 + :return: dict格式,获取历史净值和访问页数 + ''' + if html is not None: # 判断内容是否为None + content = re.findall('\((.*?)\)', html)[0] # 提取网页文本内容中的数据部分 + lsjz_list = json.loads(content)['Data']['LSJZList'] # 获取历史净值列表 + total_count = json.loads(content)['TotalCount'] # 获取数据量 + total_page = math.ceil(total_count / 20) # + lsjz = pd.DataFrame(lsjz_list) + info = {'lsjz': lsjz, + 'total_page': total_page} + return info + return None + + +def main(fundcode): + ''' + 将爬取的基金净值数据储存至本地csv文件 + ''' + html = get_one_page(fundcode) + info = parse_one_page(html) + total_page = info['total_page'] + lsjz = info['lsjz'] + lsjz.to_csv('./%s_lsjz.csv' % fundcode, index=False, encoding = 'gbk') # 将基金历史净值以csv格式储存 + page = 1 + while page < total_page: + page += 1 + print(lsjz) + html = get_one_page(fundcode, pageIndex=page) + info = parse_one_page(html) + if info is None: + break + lsjz = info['lsjz'] + lsjz.to_csv('./%s_lsjz.csv' % fundcode, mode='a', index=False, header=False, encoding = 'gbk') # 追加存储 + time.sleep(random.randint(3, 5)) + + +if __name__=='__main__': + # 获取所有基金代码 + get_fundjbgk() + # # fundcode = '519961' + # fundcodes = pd.read_csv('./fundcode.csv', converters={'fundcode': str}) + # # 获取所有基金净值数据 + # for fundcode in fundcodes['fundcode']: + # print(fundcode) + # main(fundcode) + # time.sleep(random.randint(5, 10)) \ No newline at end of file diff --git a/eastmoney/geturl.py b/eastmoney/geturl.py index 3213b4b..a8b02a8 100644 --- a/eastmoney/geturl.py +++ b/eastmoney/geturl.py @@ -5,10 +5,13 @@ UA_LIST = [ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" ] header={ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, sdch', 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 'Connection': 'keep-alive','User-Agent': random.choice(UA_LIST) } -proxies=['http://118.178.124.33:3128', -'http://139.129.166.68:3128', -'http://61.163.39.70:9999', -'http://61.143.228.162'] +# proxies=['http://118.178.124.33:3128', +# 'http://139.129.166.68:3128', +# 'http://61.163.39.70:9999', +# 'http://61.143.228.162'] + +proxies=['http://61.135.217.7:80'] + def geturl_gbk(url): html=requests.get(url,headers=header,proxies={'http':random.choice(proxies)}).content.decode('gbk') soup=BeautifulSoup(html,'lxml') diff --git a/eastmoney/test.py b/eastmoney/test.py new file mode 100644 index 0000000..4a04127 --- /dev/null +++ b/eastmoney/test.py @@ -0,0 +1,88 @@ +import requests +import json +from time import time +from lxml import etree +import re +from random import sample +from time import sleep + +np = 1 +base = "http://fund.eastmoney.com/" +acount = 2 +# 爬取排行页 +headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4620.400 QQBrowser/9.7.13014.400' +} +url = "http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf&ft=all&rs=&gs=0&sc=zzf&st=desc&sd=2017-07-09&ed=2018-07-09&qdii=&tabSubtype=,,,,,&pi=1&pn=50&dx=1&v=0." + + +def IndexSpider(url, headers): # 爬取第一个页面信息 + url = url + str(int(time())) # 这个主要是url最后一个值是v=数字,我就用时间戳来伪装了 + rsp = requests.get(url, headers=headers).content + html = rsp.decode('utf-8') + url = url[:-10] # 请求完了之后减掉加的时间戳,方面爬取下一页的时候重复操作 + return html + + +def ChangeUrl(url): # 改变url,pi代表的是页数,这样可以按顺序爬取相应页数的数据 + global acount + url = url.replace("&pi=1", "&pi=" + str(acount)) + acount = acount + 1 + return url + + +def ChangeUrl_2(jijindaima): # 我要爬取相应基金点开之后的页面的数据,分析可知是对应基金代码前面加上域名,后面加上.html + global base + jijindaima = jijindaima.replace('\"', '') + url_2 = base + jijindaima + '.html' + return url_2 + + +def DetailRequest(url): # 爬取点开那一页之后的数据 + global np + url = url.replace('\"', '') + print(url) + print("正在爬取第{0}条记录".format(np)) + np = np + 1 + re_leixing = re.compile('基金类型(.*?)') + re_jingli = re.compile('基金经理:') + re_chengliri = re.compile('成 立 日:(.*?)') + rsp = requests.get(url, headers=headers).content + html = rsp.decode('utf-8') + leixing = re_leixing.findall(html)[0][-3:] + jingli = re_jingli.findall(html)[0][-2:] + chengliri = re_chengliri.findall(html)[0] + return jingli, leixing, chengliri + + +if __name__ == '__main__': + nw = 1 + url2_detail = [] + jijindaima_list = [] + detail_url_list = [] + with open('w.txt', 'a', encoding='utf-8') as f: + f.write("基金代码\t\t基金简称\t\t单位净值\t\t累计净值\t\t基金经理\t\t基金类型\t\t成立日\n") + for i in range(1, 32): + html = IndexSpider(url, headers=headers) + url = ChangeUrl(url) + right = html.find("]") + left = html.find("[") + html = html[left + 1:right] + lists = html.split("\",\"") + for list in lists: + l = list.split(",") + jijindaima_list.append(l[0]) + for i in jijindaima_list: + detail_url_list.append(ChangeUrl_2(i)) + for i in detail_url_list: + url2_detail.append(DetailRequest(i)) + with open('w.txt', 'a', encoding='utf-8') as f: + for list, l2 in zip(lists, url2_detail): + l = list.split(",") + f.writelines( + l[0] + '\t\t' + l[1] + '\t\t' + l[4] + '\t\t' + l[5] + '\t\t' + l2[0] + '\t\t' + l2[1] + '\t\t' + + l2[2] + '\n') + print('正在写入第{0}条记录……'.format(nw)) + nw = nw + 1 + print("5秒后爬取下一页……") + sleep(5) \ No newline at end of file From a6a412ef972ff8b679010294482087a8523a0587 Mon Sep 17 00:00:00 2001 From: caocheng Date: Sat, 27 Jun 2020 17:43:08 +0800 Subject: [PATCH 4/4] update code --- eastmoney/fund_spider.py | 7 +++- eastmoney/test.py | 88 ---------------------------------------- 2 files changed, 5 insertions(+), 90 deletions(-) delete mode 100644 eastmoney/test.py diff --git a/eastmoney/fund_spider.py b/eastmoney/fund_spider.py index d996620..0a67bac 100644 --- a/eastmoney/fund_spider.py +++ b/eastmoney/fund_spider.py @@ -18,7 +18,7 @@ def get_fundcode(): cont = re.findall('var r = (.*])', r.text)[0] # 提取list ls = json.loads(cont) # 将字符串个事的list转化为list格式 fundcode = pd.DataFrame(ls, columns=['fundcode', 'fundsx', 'name', 'category', 'fundpy']) # list转为DataFrame - fundcode = fundcode.loc[0:100, ['fundcode', 'name', 'category']] + fundcode = fundcode.loc[0:10, ['fundcode', 'name', 'category']] #fundcode.to_csv('./fundcode.csv', index=False, encoding = 'gbk') return fundcode @@ -35,7 +35,10 @@ def get_fundjbgk(): for row in table.findAll('tr'): for col in row.findAll('td'): temp_jbgk.append(col.get_text()) - fund_jbgk.append(temp_jbgk) + if len(temp_jbgk) >= 16: + fund_jbgk.append(temp_jbgk[0:16]) + else: + print(temp_jbgk) time.sleep(random.randint(1, 3)) df_jbgk = pd.DataFrame(fund_jbgk, columns=['基金全称', '基金简称', '基金代码', '基金类型', '发行日期', '成立日期/规模', '资产规模', '份额规模', '基金管理人', '基金托管人', '基金经理人', '成立来分红', '管理费率', '托管费率', '销售服务费率', '最高认购费率']) df_jbgk.to_csv('./fund_info.csv', index=False, encoding = 'gbk') diff --git a/eastmoney/test.py b/eastmoney/test.py deleted file mode 100644 index 4a04127..0000000 --- a/eastmoney/test.py +++ /dev/null @@ -1,88 +0,0 @@ -import requests -import json -from time import time -from lxml import etree -import re -from random import sample -from time import sleep - -np = 1 -base = "http://fund.eastmoney.com/" -acount = 2 -# 爬取排行页 -headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4620.400 QQBrowser/9.7.13014.400' -} -url = "http://fund.eastmoney.com/data/rankhandler.aspx?op=ph&dt=kf&ft=all&rs=&gs=0&sc=zzf&st=desc&sd=2017-07-09&ed=2018-07-09&qdii=&tabSubtype=,,,,,&pi=1&pn=50&dx=1&v=0." - - -def IndexSpider(url, headers): # 爬取第一个页面信息 - url = url + str(int(time())) # 这个主要是url最后一个值是v=数字,我就用时间戳来伪装了 - rsp = requests.get(url, headers=headers).content - html = rsp.decode('utf-8') - url = url[:-10] # 请求完了之后减掉加的时间戳,方面爬取下一页的时候重复操作 - return html - - -def ChangeUrl(url): # 改变url,pi代表的是页数,这样可以按顺序爬取相应页数的数据 - global acount - url = url.replace("&pi=1", "&pi=" + str(acount)) - acount = acount + 1 - return url - - -def ChangeUrl_2(jijindaima): # 我要爬取相应基金点开之后的页面的数据,分析可知是对应基金代码前面加上域名,后面加上.html - global base - jijindaima = jijindaima.replace('\"', '') - url_2 = base + jijindaima + '.html' - return url_2 - - -def DetailRequest(url): # 爬取点开那一页之后的数据 - global np - url = url.replace('\"', '') - print(url) - print("正在爬取第{0}条记录".format(np)) - np = np + 1 - re_leixing = re.compile('基金类型(.*?)') - re_jingli = re.compile('基金经理:') - re_chengliri = re.compile('成 立 日:(.*?)') - rsp = requests.get(url, headers=headers).content - html = rsp.decode('utf-8') - leixing = re_leixing.findall(html)[0][-3:] - jingli = re_jingli.findall(html)[0][-2:] - chengliri = re_chengliri.findall(html)[0] - return jingli, leixing, chengliri - - -if __name__ == '__main__': - nw = 1 - url2_detail = [] - jijindaima_list = [] - detail_url_list = [] - with open('w.txt', 'a', encoding='utf-8') as f: - f.write("基金代码\t\t基金简称\t\t单位净值\t\t累计净值\t\t基金经理\t\t基金类型\t\t成立日\n") - for i in range(1, 32): - html = IndexSpider(url, headers=headers) - url = ChangeUrl(url) - right = html.find("]") - left = html.find("[") - html = html[left + 1:right] - lists = html.split("\",\"") - for list in lists: - l = list.split(",") - jijindaima_list.append(l[0]) - for i in jijindaima_list: - detail_url_list.append(ChangeUrl_2(i)) - for i in detail_url_list: - url2_detail.append(DetailRequest(i)) - with open('w.txt', 'a', encoding='utf-8') as f: - for list, l2 in zip(lists, url2_detail): - l = list.split(",") - f.writelines( - l[0] + '\t\t' + l[1] + '\t\t' + l[4] + '\t\t' + l[5] + '\t\t' + l2[0] + '\t\t' + l2[1] + '\t\t' + - l2[2] + '\n') - print('正在写入第{0}条记录……'.format(nw)) - nw = nw + 1 - print("5秒后爬取下一页……") - sleep(5) \ No newline at end of file