请求
import requests

# requests needs an explicit scheme; a bare host name raises MissingSchema.
url = "http://www.baidu.com"
# A timeout keeps the call from blocking forever on an unresponsive server.
resp = requests.get(url, timeout=10)
# Response body decoded to text.
htmls = resp.text
beautifulsoup系列
from bs4 import BeautifulSoup

soup = BeautifulSoup(htmls, "lxml")
# Look up an <a> tag by class, id and a custom attribute in one call.
soup.find("a", class_="title", id="t1", attrs={"alog-action": "qb-ask-uname"})
# Text of the first <div>.
soup.find("div").get_text()
# Same text with leading/trailing whitespace removed.
soup.find("div").get_text().strip()
# Text of at most the first five <div> tags.
for i in soup.find_all("div", limit=5):
    print(i.get_text())
正则系列
import re

# Abridged sample of the JSONP response body this snippet works on.
# The URLs carry JSON-escaped slashes ("\/"), hence the raw string.
htmls = r'''rollback({
    "response": { "code": "0", "msg": "Success", "dext": "" },
    "data": {
        "count": 3,
        "page": 1,
        "article_info": [{
            "title": "“小库里”:适应比赛是首要任务 投篮终会找到节奏",
            "url": "http:\/\/sports.qq.com\/a\/20180704\/035378.htm",
            "time": "2018-07-04 16:58:36",
            "column": "NBA",
            "img": "",
            "desc": ""
        }, {
            "title": "首钢体育助力国家冰球集训队 中国冰球联赛年底启动",
            "url": "http:\/\/sports.qq.com\/a\/20180704\/034698.htm",
            "time": "2018-07-04 16:34:44",
            "column": "综合体育",
            "img": "",
            "desc": ""
        }...]
    }
})'''

# Extract the title and url of every news item.
# (.*?) captures the wanted text; a bare .*? skips whatever lies in between.
# \s* tolerates the whitespace that follows each colon in the sample.
reg_str = r'"title":\s*"(.*?)",.*?"url":\s*"(.*?)"'
pattern = re.compile(reg_str, re.DOTALL)
items = pattern.findall(htmls)
for i in items:
    title = i[0]
    # Undo the JSON "\/" escaping so the URL is usable as-is.
    url = i[1].replace("\\/", "/")
过滤html标签,保留标签里的内容
import re

# Strip markup but keep the text: every "<...>" run is treated as a tag.
htmls = "<p>abc</p>"
dr = re.compile(r'<[^>]+>', re.S)
htmls2 = re.sub(dr, '', htmls)
print(htmls2)  # abc
过滤script和style标签,标签里的内容也需过滤掉
import requests
from bs4 import BeautifulSoup

url = "http://new.qq.com/omn/20180705/20180705A0920X.html"
# A timeout keeps the call from blocking forever on an unresponsive server.
r = requests.get(url, timeout=10)
htmls = r.text
soup = BeautifulSoup(htmls, "lxml")
# Calling the soup is shorthand for find_all(); extract() detaches each
# <script>/<style> tag together with its contents from the tree.
for script in soup(["script", "style"]):
    script.extract()
print(soup)
日期、时间的处理
import datetime
import time

# Current date (year-month-day)
today = datetime.date.today()
print(today)  # 2018-07-05

# Current local time, formatted
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
print(time_now)  # 2018-07-05 14:20:55

# Format a given Unix timestamp `a`
a = 1502691655
time_a = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(a)))
print(time_a)  # 2017-08-14 14:20:55 (local time in UTC+8)

# Date arithmetic
# Tomorrow's date
today = datetime.date.today()
tomorrow = today + datetime.timedelta(days=1)
print(tomorrow)  # 2018-07-06

# Date and time three days ago (kept in its own name instead of reusing
# `tomorrow`, which would be misleading)
today = datetime.datetime.today()
three_days_ago = today - datetime.timedelta(days=3)
print(three_days_ago)  # 2018-07-02 13:37:00.107703

# Time elapsed since a given moment
start = "2018-07-03 00:00:00"
time_now = datetime.datetime.now()
b = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
elapsed = time_now - b
# timedelta.seconds is only the sub-day remainder, so whole days have to be
# added separately to obtain the total number of minutes.
minutes = elapsed.seconds / 60
days = elapsed.days
all_minutes = days * 24 * 60 + minutes
print(minutes)  # 821.7666666666667
print(days)  # 2
print(all_minutes)  # 3701.7666666666664
base64编码与解码
import base64
import urllib.parse

# Base64 encode / decode
content = "abc124我是"
contents_base64 = base64.b64encode(content.encode('utf-8', 'ignore')).decode("utf-8")
# b64decode returns bytes ...
contents = base64.b64decode(contents_base64)
# ... decode them to get the original text back.
contents_text = contents.decode("utf-8")

# Decode percent-encoded (Chinese) characters in a URL.
# `import urllib` alone does not load the `parse` submodule, hence the
# explicit `import urllib.parse` above.
# NOTE: "%e8%85" is a truncated UTF-8 sequence, so it decodes to the
# replacement character rather than a real Chinese character.
url = "www.baidu.com?wb =%e8%85"
result_url = urllib.parse.unquote(url)
今天小编就为大家分享一篇关于Python常用爬虫代码的总结,方便查询。觉得内容挺不错的,现在分享给大家,具有很好的参考价值,需要的朋友一起跟随小编来看看吧。
1、beautifulsoup解析页面
import re

from bs4 import BeautifulSoup

soup = BeautifulSoup(htmltxt, "lxml")

# The three parsers
soup = BeautifulSoup("<a></p>", "html.parser")
# A lone opening tag is closed automatically; a lone closing tag is dropped.
# Result: <a></a>
soup = BeautifulSoup("<a></p>", "lxml")
# Result: <html><body><a></a></body></html>
soup = BeautifulSoup("<a></p>", "html5lib")
# html5lib also completes the stray closing tag into a full element.
# Result: <html><head></head><body><a><p></p></a></body></html>

# Finding tags by name, id, class, attributes, ...
# By class, id, the value of the alog-action attribute and the tag name
soup.find("a", class_="title", id="t1", attrs={"alog-action": "qb-ask-uname"})

# Read the value of one attribute of a tag
pubtime = soup.find("meta", attrs={"itemprop": "datePublished"}).attrs['content']

# All tags whose class is "title"
for i in soup.find_all(class_="title"):
    print(i.get_text())

# A limited number of tags whose class is "title"
for i in soup.find_all(class_="title", limit=2):
    print(i.get_text())

# get_text() accepts a separator placed between the text pieces and can
# strip the surrounding whitespace of each piece.
soup = BeautifulSoup('<p class="title" id="p1"><b> The Dormouses story </b></p><p class="title" id="p1"><b>The Dormouses story</b></p>', "html5lib")
# find() returns only the first match, so only one piece of text comes back.
soup.find(class_="title").get_text("|", strip=True)
# Result: The Dormouses story
# Called on the whole document, the separator joins the text of both tags.
soup.get_text("|", strip=True)
# Result: The Dormouses story|The Dormouses story

# id of the first tag whose class is "title"
soup.find(class_="title").get("id")

# Match class names with a regular expression
soup.find_all(class_=re.compile("tit"))

# recursive=False restricts find_all() to the direct children of the tag.
soup = BeautifulSoup('<html><head><title>abc', 'lxml')
soup.html.find_all("title", recursive=False)
# Result: [] (<title> sits under <head>, not directly under <html>)
2、unicode编码转中文
# The input holds literal "\uXXXX" escape text (backslash, "u", hex digits),
# as found in raw response bodies. A raw string keeps the backslashes; a
# normal literal would already be decoded and the round trip below would
# turn it into mojibake.
content = r"\u65f6\u75c7\u5b85"
# Encode to bytes, then let the unicode_escape codec interpret the escapes.
# NOTE: only suitable for ASCII input; real non-ASCII characters in the same
# string would be mangled by this codec.
content = content.encode("utf8", "ignore").decode('unicode_escape')
3、url encode的编码与解码
from urllib import parse

# Encode: non-ASCII text becomes its UTF-8 bytes written as %XX escapes
x = "中国你好"
y = parse.quote(x, encoding="utf-8")
print(y)

# Decode: turn the %XX escapes back into the original text
x = parse.unquote(y, encoding="utf-8")
print(x)
4、html转义字符的解码
import html

# HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
# supported replacement.
htmls = "&lt;div&gt;&lt;p&gt;"
txt = html.unescape(htmls)
print(txt)  # prints <div><p>
5、base64的编码与解码
import base64

# Encode: text -> UTF-8 bytes -> Base64 bytes -> text
content = "测试转码文本123"
raw_bytes = content.encode('utf-8', 'ignore')
encoded_bytes = base64.b64encode(raw_bytes)
contents_base64 = encoded_bytes.decode("utf-8")

# Decode: Base64 text -> the original UTF-8 bytes (a bytes object, not str)
contents = base64.b64decode(contents_base64)
6、过滤emoji表情
def filter_emoji(desstr, restr=''):
    """Replace every character above U+FFFF (where most emoji live).

    Args:
        desstr: text to clean.
        restr: replacement for each removed character (default: drop it).

    Returns:
        ``desstr`` with every code point in U+10000..U+10FFFF replaced by
        ``restr``. Characters in the Basic Multilingual Plane are left
        untouched, including symbols such as U+263A.
    """
    try:
        # The escapes need their backslashes: without them the class is a
        # set of plain ASCII characters and strips digits and letters.
        co = re.compile(u'[\U00010000-\U0010ffff]')
    except re.error:
        # Fallback for builds that cannot express the range above:
        # match a high/low surrogate pair instead.
        co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
    return co.sub(restr, desstr)
7、完全过滤script和style标签
import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(htmls, "lxml")
# Calling the soup is shorthand for find_all(); extract() detaches each
# <script>/<style> tag together with its contents from the tree.
for script in soup(["script", "style"]):
    script.extract()
print(soup)
8、过滤html的标签,但保留标签里的内容
import re

htmls = "<p>abc</p>"
# Every "<...>" run counts as a tag. Splitting on the tags and gluing the
# remaining pieces back together leaves only the text between them.
dr = re.compile(r'<[^>]+>', re.S)
text_pieces = dr.split(htmls)
htmls2 = ''.join(text_pieces)
print(htmls2)  # abc
正则提取内容(一般处理json)
import re

# Abridged sample of the JSONP response body this snippet works on.
htmls = '''rollback({
    "response": {
        "code": "0",
        "msg": "Success",
        "dext": ""
    },
    "data": {
        "count": 3,
        "page": 1,
        "article_info": [{
            "title": "“小库里”:适应比赛是首要任务 投篮终会找到节奏",
            "url": "http://sports.qq.com/a/20180704/035378.htm",
            "time": "2018-07-04 16:58:36",
            "column": "NBA",
            "img": "",
            "desc": ""
        }, {
            "title": "首钢体育助力国家冰球集训队 中国冰球联赛年底启动",
            "url": "http://sports.qq.com/a/20180704/034698.htm",
            "time": "2018-07-04 16:34:44",
            "column": "综合体育",
            "img": "",
            "desc": ""
        }...]
    }
})'''

# Extract the title and url of every news item.
# (.*?) captures the wanted text; a bare .*? skips whatever lies in between.
# \s* tolerates the whitespace that follows each colon in the sample.
reg_str = r'"title":\s*"(.*?)",.*?"url":\s*"(.*?)"'
pattern = re.compile(reg_str, re.DOTALL)
items = pattern.findall(htmls)
for i in items:
    title = i[0]
    url = i[1]
9、时间操作
# Current date
today = datetime.date.today()
print(today)  # 2018-07-05

# Current local time, formatted
time_now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
print(time_now)  # 2018-07-05 14:20:55

# Format a Unix timestamp
a = 1502691655
time_a = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(a)))
print(time_a)  # 2017-08-14 14:20:55 (local time in UTC+8)

# Parse a string into a datetime object.
# The variable is not called `str` (that would shadow the builtin), and the
# same name is passed to strptime; the result is kept instead of discarded.
time_str = "2018-07-01 00:00:00"
parsed_time = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")

# Convert a formatted local time into a Unix timestamp
time_line = "2018-07-16 10:38:50"
time_tuple = time.strptime(time_line, "%Y-%m-%d %H:%M:%S")
time_line2 = int(time.mktime(time_tuple))

# Tomorrow's date
today = datetime.date.today()
tomorrow = today + datetime.timedelta(days=1)
print(tomorrow)  # 2018-07-06

# Date and time three days ago (kept in its own name instead of reusing
# `tomorrow`, which would be misleading)
today = datetime.datetime.today()
three_days_ago = today - datetime.timedelta(days=3)
print(three_days_ago)  # 2018-07-02 13:37:00.107703

# Time elapsed since a given moment
start = "2018-07-03 00:00:00"
time_now = datetime.datetime.now()
b = datetime.datetime.strptime(start, '%Y-%m-%d %H:%M:%S')
elapsed = time_now - b
# timedelta.seconds is only the sub-day remainder, so whole days have to be
# added separately to obtain the total number of minutes.
minutes = elapsed.seconds / 60
days = elapsed.days
all_minutes = days * 24 * 60 + minutes
print(minutes)  # 821.7666666666667
print(days)  # 2
print(all_minutes)  # 3701.7666666666664
10、数据库操作
import pymysql

# NOTE(review): host and credentials are hard-coded for illustration only;
# load them from configuration in real code.
conn = pymysql.connect(host='10.0.8.81', port=3306, user='root', passwd='root', db='xxx', charset='utf8')
cur = conn.cursor()
insert_sql = "insert into tbl_name(id,name,age) values(%s,%s,%s)"
# `row_id` rather than `id`, which would shadow the builtin.
row_id = 1
name = "like"
age = 26
data_list = []
data = (row_id, name, age)

# Single-row insert
cur.execute(insert_sql, data)
conn.commit()

# Batch insert
data_list.append(data)
cur.executemany(insert_sql, data_list)
conn.commit()

# Special characters in `name`: values passed as query parameters are
# escaped by the driver, so escaping them by hand would escape them twice.
data = (row_id, name, age)

# Update: bind the values as parameters instead of formatting them into the
# SQL text, which avoids quoting bugs and SQL injection.
update_sql = "update tbl_name set content = %s where id = %s"
cur.execute(update_sql, (content, row_id))
conn.commit()

# Batch update: collect rows and flush them once the batch is large enough.
update_sql = "UPDATE tbl_recieve SET content = %s ,title = %s , is_spider = %s WHERE id = %s"
update_data = (contents, title, is_spider, one_new[0])
update_data_list.append(update_data)
if len(update_data_list) > 500:
    try:
        cur.executemany(update_sql, update_data_list)
        conn.commit()
    except pymysql.MySQLError:
        # Undo the partial batch and let the caller see the failure.
        conn.rollback()
        raise
    else:
        # Rows are written; start a fresh batch so they are not sent again.
        update_data_list.clear()
以上就是小编今天为大家总结的一些Python常用的爬虫代码。
机器之心编译
参与:Ellen Han、吴攀
在深度学习中,循环神经网络(RNN)是一系列善于从序列数据中学习的神经网络。由于对长期依赖问题的鲁棒性,长短期记忆(LSTM)是一类已经有实际应用的循环神经网络。现在已有大量关于LSTM的文章和文献,其中推荐如下两篇:
Goodfellow et.al. 《深度学习》一书第十章:http://www.deeplearningbook.org/
Chris Olah:理解 LSTM:http://colah.github.io/posts/2015-08-Understanding-LSTMs/
已存在大量优秀的库可以帮助你基于LSTM构建机器学习应用。在GitHub中,谷歌的TensorFlow在此文成文时已有超过 50000 次星,表明了其在机器学习从业者中的流行度。
与此形成对比,相对缺乏的似乎是关于如何基于LSTM建立易于理解的TensorFlow应用的优秀文档和示例,这也是本文尝试解决的问题。
假设我们想用一个样本短故事来训练LSTM预测下一个单词,伊索寓言:
long ago , the mice had a general council to consider what measures they could take to outwit their common enemy , the cat . some said this , and some said that but at last a young mouse got up and said he had a proposal to make , which he thought would meet the case . you will all agree , said he , that our chief danger consists in the sly and treacherous manner in which the enemy approaches us . now , if we could receive some signal of her approach , we could easily escape from her . i venture , therefore , to propose that a small bell be procured , and attached by a ribbon round the neck of the cat . by this means we should always know when she was about , and could easily retire while she was in the neighbourhood . this proposal met with general applause , until an old mouse got up and said that is all very well , but who is to bell the cat ? the mice looked at one another and nobody spoke . then the old mouse said it is easy to propose impossible remedies .
表1.取自伊索寓言的短故事,其中有112个不同的符号。单词和标点符号都视作符号。
如果我们将文本中的3个符号以正确的序列输入LSTM,以1个标记了的符号作为输出,最终神经网络将学会正确地预测下一个符号(Figure1)。
图 1.有3个输入和1个输出的LSTM单元
严格说来,LSTM只能理解输入的实数。一种将符号转化为数字的方法是基于每个符号出现的频率为其分配一个对应的整数。例如,上面的短文中有112个不同的符号。如表2所示的函数建立了一个有如下条目 [ “,” : 0 ] [ “the” : 1 ], …, [ “council” : 37 ],…,[ “spoke” : 111 ]的词典。而为了解码LSTM的输出,同时也生成了逆序字典。
def build_dataset(words):
    """Build the symbol -> integer dictionary and its reverse.

    The listing was truncated to its header; this body is reconstructed
    from the surrounding description: symbols are numbered by how often
    they occur (the most frequent symbol gets 0), and a reverse dictionary
    is produced for decoding predictions.

    Args:
        words: iterable of symbols (words and punctuation marks).

    Returns:
        ``(dictionary, reverse_dictionary)`` where ``dictionary`` maps each
        symbol to its integer and ``reverse_dictionary`` maps each integer
        back to its symbol.
    """
    import collections

    # most_common() sorts by descending count, so ids follow frequency rank.
    count = collections.Counter(words).most_common()
    dictionary = {}
    for word, _ in count:
        dictionary[word] = len(dictionary)
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, reverse_dictionary
表 2.建立字典和逆序字典的函数
类似地,预测值也是一个唯一的整数值与逆序字典中预测符号的索引相对应。例如:如果预测值是37,预测符号便是“council”。
输出的生成看起来似乎简单,但实际上LSTM为下一个符号生成了一个含有112个元素的预测概率向量,并用softmax()函数归一化。有着最高概率值的元素的索引便是逆序字典中预测符号的索引值(即:一个 one-hot 向量)。图2 给出了这个过程。
图2.每一个输入符号被分配给它的独一无二的整数值所替代。输出是一个表明了预测符号在反向词典中索引的 one-hot 向量。
LSTM模型是这个应用的核心部分。令人惊讶的是,它很易于用TensorFlow实现:
def RNN(x, weights, biases):
    """Build a single-layer LSTM and return its projected last output.

    Args:
        x: input tensor; reshaped to [-1, n_input] and split into n_input
            slices along axis 1.
        weights: mapping whose 'out' entry is matrix-multiplied with the
            last LSTM output.
        biases: mapping whose 'out' entry is added to that product.

    Returns:
        ``tf.matmul(outputs[-1], weights['out']) + biases['out']``.

    NOTE(review): ``tf``, ``rnn``, ``n_input`` and ``n_hidden`` come from
    outside this function; ``rnn`` is presumably ``tensorflow.contrib.rnn``
    (TensorFlow 1.x) — confirm.
    """
    # reshape to [1, n_input]
    x = tf.reshape(x, [-1, n_input])

    # Generate a n_input-element sequence of inputs
    # (eg. [had] [a] [general] → [20] [6] [33])
    x = tf.split(x, n_input, 1)

    # 1-layer LSTM with n_hidden units.
    rnn_cell = rnn.BasicLSTMCell(n_hidden)

    # generate prediction
    outputs, states = rnn.static_rnn(rnn_cell, x, dtype=tf.float32)

    # there are n_input outputs but
    # we only want the last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']
表3.有512个LSTM 单元的网络模型
最难的部分是以正确的格式和顺序完成输入。在这个例子中,LSTM的输入是一个有3个整数的序列(即:1x3 的整数向量)。
网络的常量、权值和偏差设置如下:
# Vocabulary size: the number of entries in the symbol -> integer dictionary.
vocab_size = len(dictionary)
表4.常量和训练参数
训练过程中的每一步,3个符号都在训练数据中被检索。然后3个符号转化为整数以形成输入向量。
# Look up the integer id of each of the n_input symbols starting at `offset`
# in training_data; every id is wrapped in its own one-element list.
symbols_in_keys = [ [dictionary[ str(training_data[i])]] for i in range(offset, offset+n_input) ]
表 5.将符号转化为整数向量作为输入
训练标签是一个位于3个输入符号之后的 one-hot 向量
# Float vector of vocab_size zeros; per the surrounding text it is the basis
# of the one-hot training label (the step that sets the 1 is not shown here).
symbols_out_onehot = np.zeros([vocab_size], dtype=float)
表6.one-hot 向量作为标签
在转化为输入词典的格式后,进行如下的优化过程:
# One session.run call: evaluates optimizer, accuracy, cost and pred with the
# current sample fed in as x and its label as y; the optimizer result is discarded.
_, acc, loss, onehot_pred = session.run([optimizer, accuracy, cost, pred], feed_dict={x: symbols_in_keys, y: symbols_out_onehot})
表 7.训练过程中的优化
精度和损失被累积以监测训练过程。通常50,000次迭代足以达到可接受的精度要求。
...
表 8.一个训练间隔的预测和精度数据示例(间隔1000步)
代价是标签和softmax()预测之间的交叉熵,它被RMSProp以 0.001的学习率进行优化。在本文示例的情况中,RMSProp通常比Adam和SGD表现得更好。
# Prediction produced by the RNN function for input x with the given weights and biases.
pred = RNN(x, weights, biases)
表 9.损失和优化器
LSTM的精度可以通过增加层来改善。
# Combine two BasicLSTMCell layers, each with n_hidden units, into one multi-layer cell.
rnn_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(n_hidden),rnn.BasicLSTMCell(n_hidden)])
表 10.改善的LSTM
现在,到了有意思的部分。让我们通过将预测得到的输出作为输入中的下一个符号输入LSTM来生成一个故事吧。示例输入是“had a general”,LSTM给出了正确的输出预测“council”。然后“council”作为新的输入“a general council”的一部分输入神经网络得到下一个输出“to”,如此循环下去。令人惊讶的是,LSTM创作出了一个有一定含义的故事。
had a general council to consider what measures they could take to outwit their common enemy , the cat . some said this , and some said that but at last a young mouse got
表11.截取了样本故事生成的故事中的前32个预测值
如果我们输入另一个序列(例如:“mouse”, “mouse”, “mouse”)但并不一定是这个故事中的序列,那么会自动生成另一个故事。
mouse mouse mouse , neighbourhood and could receive a outwit always the neck of the cat . some said this , and some said that but at last a young mouse got up and said
表 12.并非来源于示例故事中的输入序列
示例代码可以在这里找到:https://github.com/roatienza/Deep-Learning-Experiments/blob/master/Experiments/Tensorflow/RNN/rnn_words.py
示例文本的链接在这里:https://github.com/roatienza/Deep-Learning-Experiments/blob/master/Experiments/Tensorflow/RNN/belling_the_cat.txt
小贴士:
用整数值编码符号容易操作但会丢失单词的意思。本文中将符号转化为整数值是用来简化关于用TensorFlow建立LSTM应用的讨论的。更推荐采用Word2Vec将符号编码为向量。
将输出表达成 one-hot 向量是效率较低的方式,尤其当我们有一个现实的单词量大小时。牛津词典有超过170,000个单词,而上面的例子中只有112个单词。再次声明,本文中的示例只为了简化讨论。
这里采用的代码受到了Tensorflow-Examples的启发:https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/recurrent_network.py
本文例子中的输入大小为3,看一看当采用其它大小的输入时会发生什么吧(例如:4,5或更多)。
每次运行代码都可能生成不同的结果,LSTM的预测能力也会不同。这是由于精度依赖于初始参数的随机设定。训练次数越多(超过150,000次)精度也会相应提高。每次运行代码,建立的词典也会不同
Tensorboard在调试中,尤其当检查代码是否正确地建立了图时很有用。
试着用另一个故事测试LSTM,尤其是用另一种语言写的故事。
*请认真填写需求信息,我们会在24小时内与您取得联系。