1. Get the news details from a news URL: a dict, anews
2. Get the news URLs from a list-page URL: list.append(dict), alist
3. Generate the URLs of all the list pages and fetch all the news: list.extend(list), allnews
   * Each student crawls the 10 list pages starting from the last digits of their student ID.
   (A short usage sketch for steps 1-3 follows the function definitions below.)
4. Set a reasonable crawl interval:
   import time
   import random
   time.sleep(random.random()*3)
5. Do some simple data processing with pandas and save the result:
   to a csv or excel file: newsdf.to_csv(r'F:\duym\爬虫\gzccnews.csv')
   to a database:
   import sqlite3
   with sqlite3.connect('gzccnewsdb.sqlite') as db:
       newsdf.to_sql('gzccnewsdb', db)

The full script:

# -*- coding: utf-8 -*-
"""
Created on Thu Apr 11 12:33:03 2019

@author: Administrator
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas as pd
import time
import random
import sqlite3

newsUrl = 'http://news.gzcc.cn/html/2005/xiaoyuanxinwen_0710/4.html'
listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'

def click(url):
    # pull the news id out of the url and query the campus click-count API
    id = re.findall(r'(\d{1,5})', url)[-1]
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(id)
    resClick = requests.get(clickUrl)
    newsClick = int(resClick.text.split('.html')[-1].lstrip("('").rstrip("');"))
    return newsClick

def newsdt(showinfo):
    # parse the publish date and time out of the show-info line
    newsDate = showinfo.split()[0].split(':')[1]
    newsTime = showinfo.split()[1]
    newsDT = newsDate + ' ' + newsTime
    dt = datetime.strptime(newsDT, '%Y-%m-%d %H:%M:%S')
    return dt

def anews(url):
    # step 1: one news url -> one dict of details
    newsDetail = {}
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsDetail['newsTitle'] = soup.select('.show-title')[0].text
    showinfo = soup.select('.show-info')[0].text
    newsDetail['newsDT'] = newsdt(showinfo)
    newsDetail['newsClick'] = click(url)  # use the url passed in, not the global newsUrl
    return newsDetail

def alist(url):
    # step 2: one list-page url -> a list of news dicts
    res = requests.get(url)  # use the url passed in, not the global listUrl
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsUrl = news.select('a')[0]['href']
            newsDesc = news.select('.news-list-description')[0].text
            newsDict = anews(newsUrl)
            newsDict['description'] = newsDesc
            newsList.append(newsDict)
    return newsList

# quick test calls
alist(listUrl)
alist(newsUrl)
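Steps 1-3 of the assignment are really about the three shapes these functions produce: a dict, a list of dicts, and one long list built with extend(). A minimal usage sketch reusing the newsUrl and listUrl defined above (this snippet is an added illustration, not part of the original post):

oneNews = anews(newsUrl)      # step 1: a single news item as a dict
onePage = alist(listUrl)      # step 2: every item on one list page
combined = []
combined.extend(onePage)      # step 3: accumulate pages with list.extend()
print(type(oneNews), len(onePage), len(combined))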
res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('li'):
    if len(news.select('.news-list-title')) > 0:
        newsUrl = news.select('a')[0]['href']
        print(anews(newsUrl))

allnews = []
for i in range(57, 67):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))

print('allnewsLength={}'.format(len(allnews)))
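Step 4 calls for a reasonable crawl interval. In this post the random delay only appears in a separate demo loop near the end, so here is a hedged sketch of how the same time.sleep(random.random()*3) pause could live inside the list-page loop itself (a variant of the loop above, shown for illustration rather than to be run again):

for i in range(57, 67):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))
    time.sleep(random.random() * 3)  # pause 0-3 seconds between list pages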
print(allnews)

res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('li'):
    if len(news.select('.news-list-title')) > 0:
        newsUrl = news.select('a')[0]['href']
        print(anews(newsUrl))

s1 = pd.Series([100, 23, 'bugingcode'])
print(s1)
pd.Series(anews(newsUrl))  # preview a single news dict as a pandas Series
newsdf = pd.DataFrame(allnews)
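Step 5 allows the result to be saved to a csv or an excel file; the original post only shows the csv and sqlite versions below, so this is a sketch of the excel variant (the path is hypothetical, and writing/reading .xlsx files needs an engine such as openpyxl installed):

newsdf.to_excel(r'D:\py_file\gzcc.xlsx')         # hypothetical path
back = pd.read_excel(r'D:\py_file\gzcc.xlsx')    # read it back to verify
print(back.head())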
# demo of the random crawl delay from step 4
for i in range(5):
    print(i)
    time.sleep(random.random() * 3)

print(newsdf)

# save to csv (step 5)
newsdf.to_csv(r'D:\py_file\gzcc.csv', encoding='utf_8_sig')

# save to a sqlite database (step 5)
with sqlite3.connect(r'D:\py_file\gzccnewsdb.sqlite') as db:
    newsdf.to_sql('gzccnewsdb', db)

Reposted from: https://www.cnblogs.com/gswyz/p/10688905.html