【用Python写爬虫】获取html的方法【一】:使用urllib
# -*- coding: UTF-8 -*-
import urllib
def getWebPageContent(url):
    """Fetch ``url`` with ``urllib.urlopen`` and return the response body.

    The response object is closed even when reading it raises.
    """
    f = urllib.urlopen(url)
    try:
        return f.read()
    finally:
        f.close()
# Example run: fetch the page at `url` and print what was downloaded.
url = 'http://blog.csdn.net'
content = getWebPageContent(url)
# A single parenthesised value prints the same as the bare print statement.
print(content)
【用Python写爬虫】获取html的方法【二】:使用pycurl
# Pycurl参考地址:http://pycurl.sourceforge.net/
# Pycurl下载地址:http://pycurl.sourceforge.net/download/pycurl-7.18.1.tar.gz
# -*-coding: UTF-8 -*-
import pycurl
import StringIO


def getURLContent_pycurl(url):
    """Download ``url`` with pycurl and return the response body.

    Redirects are followed, up to a maximum of five. The curl handle is
    closed even when the transfer fails.
    """
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.URL, url)
        # Collect the body in memory: every received chunk goes to b.write.
        b = StringIO.StringIO()
        c.setopt(pycurl.WRITEFUNCTION, b.write)
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        # Optional proxy settings (uncomment and adjust to go through a proxy):
        #c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
        #c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')
        c.perform()
        return b.getvalue()
    finally:
        c.close()
# Example run: download the page at `url` through pycurl and print it.
url = 'http://blog.csdn.net'
content = getURLContent_pycurl(url)
# A single parenthesised value prints the same as the bare print statement.
print(content)
【用Python写爬虫】获取html的方法【三】:使用cPAMIE
# cPAMIE下载:http://sourceforge.net/project/showfiles.php?group_id=103662
# -*-coding: UTF-8 -*-
import cPAMIE
def getURLContent_cPAMIE(url):
    """Navigate a cPAMIE ``PAMIE`` object to ``url`` and return the page text.

    ``quit()`` is always called on the PAMIE object, even when navigation or
    text extraction raises.
    """
    g_ie = cPAMIE.PAMIE()
    try:
        g_ie.showDebugging = False
        g_ie.frameName = None
        g_ie.navigate(url)
        return g_ie.pageGetText()
    finally:
        g_ie.quit()
# Example run: load the page at `url` through cPAMIE and print its text.
url = 'http://blog.csdn.net'
content = getURLContent_cPAMIE(url)
# A single parenthesised value prints the same as the bare print statement.
print(content)
【用Python写爬虫】获取html的方法【四】:使用urllib下载文件
# -*- coding: UTF-8 -*-
import urllib

url = 'http://blog.csdn.net'
path = 'C:\\temp\\csdn.net.html'
# Download the resource at `url` and save it to the local file `path`.
urllib.urlretrieve(url, path)
【用Python写爬虫】获取html的方法【五】:利用Twisted框架之client.getPage
# Twisted框架下载:http://twistedmatrix.com/trac/wiki/Downloads
# -*-coding: UTF-8 -*-
from twisted.internet import reactor
from twisted.web import client
def result(content):
    """Print the downloaded page and stop the reactor."""
    print(content)
    reactor.stop()


def _failed(failure):
    """Print why the download failed and stop the reactor.

    Without this errback a failed request would leave ``reactor.run()``
    blocking forever, because ``result`` is the only place that stops it.
    """
    print(failure.getErrorMessage())
    reactor.stop()


deferred = client.getPage("http://blog.csdn.net")
deferred.addCallback(result)
deferred.addErrback(_failed)
reactor.run()
1、安装python2.5(win32)
http://www.python.org/download/
2、安装Apache2.2
http://apache.freelamp.com/httpd/binaries/win32/httpd-2.2.15-win32-x86-openssl-0.9.8m-r2.msi
3、安装mod_python 3.3.1
http://www.apache.org/dist/httpd/modpython/win/3.3.1/mod_python-3.3.1.win32-py2.5-Apache2.2.exe
1、配置Apache配置文件,httpd.conf:
# Accept connections on 127.0.0.1, port 80.
Listen 127.0.0.1:80
# Remember to load mod_python.so.
LoadModule python_module modules/mod_python.so
# Grant Apache access to the static-file directories.
# Mind the difference between "\" and "/" when writing these paths.
# Django admin media directory.
<Directory "C:/Python25/Lib/site-packages/django/contrib/admin/media">
AllowOverride None
Options None
Order allow,deny
Allow from all
</Directory>
# The mysite project's themes directory.
<Directory "C:/djangoapp/mysite/themes">
Options Indexes FollowSymLinks
AllowOverride None
Order allow,deny
Allow from all
</Directory>
<VirtualHost 127.0.0.1:80>
    # The mysite project lives under C:/djangoapp. PythonPath must be set to
    # that parent directory, not to the mysite directory itself.
    # This site's URL: http://127.0.0.1:80/mysite/
    <Location "/mysite">
        SetHandler python-program
        PythonPath "['C:/djangoapp'] + sys.path"
        PythonHandler django.core.handlers.modpython
        SetEnv DJANGO_SETTINGS_MODULE mysite.settings
        PythonAutoReload On
        PythonOption django.root /mysite
        PythonDebug On
    </Location>

    # Alias /site_media would map a URL onto the project's static files
    # (example left disabled):
    #Alias /site_media c:/django/myproject/media
    # <Location "/site_media/">
    # SetHandler None
    # </Location>

    # Map /themes onto the C:/djangoapp/mysite/themes directory.
    # Forward slashes are used here, as in the <Directory> paths of this
    # configuration.
    Alias /themes "C:/djangoapp/mysite/themes"
    <Location "/themes/">
        SetHandler None
    </Location>

    # Map /media onto the Django admin's static files.
    Alias /media "C:/Python25/Lib/site-packages/django/contrib/admin/media"
    <Location "/media/">
        SetHandler None
    </Location>

    # File types we want to serve statically (case-insensitive match).
    <LocationMatch "(?i)\.(jpg|gif|png|txt|ico|pdf|css|jpeg)$">
        SetHandler None
    </LocationMatch>
</VirtualHost>
# Disable caching so that modified Python source code is reloaded
# automatically. Remove this directive for a production release.
MaxRequestsPerChild 1
以上是我的conf参数,仅供参考
2、检查是否配置成功,修改urls.py:
from django.conf.urls.defaults import *
import os
# Uncomment the next two lines to enable the admin:
from django.contrib import admin
admin.autodiscover()
# Directory handed to django.views.static.serve for /themes/ requests:
# the "themes" folder next to this urls.py.
_THEMES_ROOT = os.path.dirname(os.path.abspath(__file__)) + '/themes/'

urlpatterns = patterns(
    '',
    # Blog URLs are included at the site root and again under blog/.
    (r'^', include('mysite.blog.urls')),
    # Django admin site.
    (r'^admin/', include(admin.site.urls)),
    (r'^blog/', include('mysite.blog.urls')),
    # Theme files served by Django's static view.
    (r'^themes/(?P<path>.*)$', 'django.views.static.serve',
     {'document_root': _THEMES_ROOT}),
)
OK,输入 http://127.0.0.1/mysite/ 即可