python编写网页爬虫脚本并实现APScheduler调度(2)
msg = MIMEText(context) #文本邮件的内容
msg['Subject'] = sub #主题
msg['From'] = "my@vmail.cn" #发信人
msg['To'] = COMMASPACE.join(mailto_list) #收信人列表
def send_mail(mailto_list, sub, context): COMMASPACE = ',' mail_host = "localhost" me = "my@vmail.cn" # Create a text/plain message msg = MIMEText(context) msg['Subject'] = sub msg['From'] = "my@vmail.cn" msg['To'] = COMMASPACE.join(mailto_list) send_smtp = smtplib.SMTP(mail_host) send_smtp.sendmail(me, mailto_list, msg.as_string()) send_smtp.close()
应用文档:http://docs.python.org/2/library/email.html?highlight=smtplib#
4.Python调度框架ApScheduler
下载地址https://pypi.python.org/pypi/APScheduler/2.1.0
官方文档:http://pythonhosted.org/APScheduler/#faq
API:http://pythonhosted.org/APScheduler/genindex.html
安装方法:下载之后解压缩,然后执行python setup.py install,导入模块
from apscheduler.scheduler import Scheduler
ApScheduler配置比较简单,本例中只用到了add_interval_job方法,在每间隔一段时间后执行任务脚本,本例中的间隔是30分钟。可参考实例文章http://flykite.blog.51cto.com/4721239/832036
# Start the scheduler sched = Scheduler() sched.daemonic = False sched.add_interval_job(job,minutes=30) sched.start()
关于daemonic参数:
apscheduler会创建一个线程,这个线程默认是daemon=True,也就是默认的是线程守护的。
在上面的代码里面,要是不加上sched.daemonic=False的话,这个脚本就不会按时间运行。
因为脚本要是没有sched.daemonic=False,它会创建一个守护线程。这个过程中,会创建scheduler的实例。但是由于脚本运行速度很快,主线程mainthread会马上结束,而此时定时任务的线程还没来得及执行,就跟随主线程结束而结束了。(守护线程和主线程之间的关系决定的)。要让脚本运行正常,必须设置该脚本为非守护线程。sched.daemonic=False
附:全部脚本代码
All Code
#-*- coding: UTF-8 -*-
import urllib2
from sgmllib import SGMLParser
import pymongo
import time
# Import smtplib for the actual sending function
import smtplib
from email.mime.text import MIMEText
from apscheduler.scheduler import Scheduler
#get freebook hrefs
class ListHref(SGMLParser):
def __init__(self):
SGMLParser.__init__(self)
self.is_a = ""
self.name = []
self.freehref=""
self.hrefs=[]
def start_a(self, attrs):
self.is_a = 1
href = [v for k, v in attrs if k == "href"]
self.freehref=href[0]
def end_a(self):
self.is_a = ""
def handle_data(self, text):
if self.is_a == 1 and text.decode('utf8').encode('gbk')=="限时免费":
self.hrefs.append(self.freehref)
#get freebook Info
class FreeBook(SGMLParser):
def __init__(self):
SGMLParser.__init__(self)
self.is_title=""
self.name = ""
def start_title(self, attrs):
self.is_title = 1
def end_title(self):
self.is_title = ""
def handle_data(self, text):
if self.is_title == 1:
self.name=text
#Mongo Store Module
class freeBookMod:
def __init__(self, date, bookname ,href):
self.date=date
self.bookname=bookname
self.href=href
def get_book(bookList):
content = urllib2.urlopen('http://sale.jd.com/act/yufbrhZtjx6JTV.html').read()
listhref = ListHref()
listhref.feed(content)
for href in listhref.hrefs:
content = urllib2.urlopen(str(href)).read()
listbook=FreeBook()
listbook.feed(content)
name = listbook.name
n= name.index('》')
#print (name[0:n+2])
freebook=freeBookMod(time.strftime('%Y-%m-%d',time.localtime(time.time())),name[0:n+2],href)
bookList.append(freebook)
return bookList
def record_book(bookList,context,isSendMail):
# DataBase Operation
mongoCon=pymongo.Connection(host="127.0.0.1",port=27017)
db= mongoCon.mydatabase
for bookItem in bookList:
bookInfo = db.book.find_one({"href":bookItem.href})
if not bookInfo:
b={
"bookname":bookItem.bookname.decode('gbk').encode('utf8'),
"href":bookItem.href,
"date":bookItem.date
}
db.book.insert(b,safe=True)
isSendMail=True
context=context+bookItem.bookname.decode('gbk').encode('utf8')+','
return context,isSendMail
#Send Message
def send_mail(mailto_list, sub, context):
COMMASPACE = ','
mail_host = "localhost"
me = "my@vmail.cn"
# Create a text/plain message
msg = MIMEText(context)
msg['Subject'] = sub
msg['From'] = "my@vmail.cn"
msg['To'] = COMMASPACE.join(mailto_list)
send_smtp = smtplib.SMTP(mail_host)
send_smtp.sendmail(me, mailto_list, msg.as_string())
send_smtp.close()
#Main job for scheduler
def job():
bookList=[]
isSendMail=False;
context="Today free books are"
mailto_list=["mailto@mail.cn"]
bookList=get_book(bookList)
context,isSendMail=record_book(bookList,context,isSendMail)
if isSendMail==True:
send_mail(mailto_list,"Free Book is Update",context)
if __name__=="__main__":
# Start the scheduler
sched = Scheduler()
sched.daemonic = False
sched.add_interval_job(job,minutes=30)
sched.start()






