用 Python 写简单爬虫的五种方法

【用Python写爬虫】获取html的方法【一】:使用urllib

# -*- coding: UTF-8 -*-
import urllib

def getWebPageContent(url):
    """Fetch *url* with urllib and return the raw response body.

    The response handle is closed even when reading fails, so an
    interrupted transfer does not leak the underlying connection.
    """
    f = urllib.urlopen(url)
    try:
        return f.read()
    finally:
        f.close()

# Download the CSDN blog front page and print the raw HTML.
url = 'http://blog.csdn.net'
content = getWebPageContent(url)
print(content)

【用Python写爬虫】获取html的方法【二】:使用pycurl

# Pycurl参考地址:http://pycurl.sourceforge.net/
# Pycurl下载地址:http://pycurl.sourceforge.net/download/pycurl-7.18.1.tar.gz

# -*-coding: UTF-8 -*-
import pycurl
import StringIO


def getURLContent_pycurl(url):
    """Fetch *url* with pycurl and return the response body.

    Redirects are followed, up to five hops. The Curl handle is closed
    whether or not the transfer succeeds.
    """
    c = pycurl.Curl()
    b = StringIO.StringIO()
    try:
        c.setopt(pycurl.URL, url)
        # Received data is handed to b.write, so the body accumulates in memory.
        c.setopt(pycurl.WRITEFUNCTION, b.write)
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        # Optional proxy settings:
        #c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
        #c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')
        c.perform()
    finally:
        # Release the handle even if setopt/perform raised.
        c.close()
    return b.getvalue()
 
# Download the CSDN blog front page through pycurl and print the raw HTML.
url = 'http://blog.csdn.net'
content = getURLContent_pycurl(url)
print(content)

【用Python写爬虫】获取html的方法【三】:使用cPAMIE
# cPAMIE下载:http://sourceforge.net/project/showfiles.php?group_id=103662

# -*-coding: UTF-8 -*-
import cPAMIE
def getURLContent_cPAMIE(url):
    """Navigate a cPAMIE.PAMIE automation object to *url* and return the page text.

    The automation object is always shut down with quit(), even when
    navigation or text extraction raises.
    """
    g_ie = cPAMIE.PAMIE()
    try:
        g_ie.showDebugging = False
        g_ie.frameName = None
        g_ie.navigate(url)
        content = g_ie.pageGetText()
    finally:
        g_ie.quit()
    return content

# Load the CSDN blog front page through cPAMIE and print its text.
url = 'http://blog.csdn.net'
content = getURLContent_cPAMIE(url)
print(content)


【用Python写爬虫】获取html的方法【四】:使用urllib下载文件

# -*- coding: UTF-8 -*-
import urllib
# Save the CSDN blog front page straight to a local file.
url = 'http://blog.csdn.net'
path = 'C:\\temp\\csdn.net.html'
urllib.urlretrieve(url, filename=path)


【用Python写爬虫】获取html的方法【五】:利用Twisted框架之client.getPage
# Twisted框架下载:http://twistedmatrix.com/trac/wiki/Downloads

# -*-coding: UTF-8 -*-
from twisted.internet import reactor
from twisted.web import client

def result(content):
    """Print the downloaded page body, then stop the reactor so the script exits."""
    print(content)
    reactor.stop()


def _on_error(failure):
    """Report a failed download and stop the reactor so the script does not hang."""
    print(failure.getErrorMessage())
    reactor.stop()


# getPage returns a Deferred; nothing is fetched until the reactor runs.
deferred = client.getPage("http://blog.csdn.net")
deferred.addCallback(result)
# Without an errback a failed request would leave reactor.run() blocked forever.
deferred.addErrback(_on_error)
reactor.run()

Django环境配置 python2.5+Apache+mod_python

一、环境安装

        1、安装python2.5(win32)

                http://www.python.org/download/

        2、安装Apache2.2

                http://apache.freelamp.com/httpd/binaries/win32/httpd-2.2.15-win32-x86-openssl-0.9.8m-r2.msi

        3、安装mod_python 3.3.1

              http://www.apache.org/dist/httpd/modpython/win/3.3.1/mod_python-3.3.1.win32-py2.5-Apache2.2.exe

二、配置Apache

        1、配置Apache配置文件,httpd.conf:

 

# Accept connections on the loopback address, port 80.
Listen 127.0.0.1:80

# Remember to load mod_python.so
LoadModule python_module modules/mod_python.so


# Grant Apache permission to read the static files.
# When setting paths, mind the difference between "\" and "/".
<Directory "C:/Python25/Lib/site-packages/django/contrib/admin/media">
    AllowOverride None
    Options None
    Order allow,deny
    Allow from all
</Directory>

# Theme files of the mysite project; directory listings and symlinks are allowed here.
<Directory "C:/djangoapp/mysite/themes">
    AllowOverride None
    Options Indexes FollowSymLinks
    Order allow,deny
    Allow from all
</Directory>

<VirtualHost 127.0.0.1:80>
# The mysite project lives in C:/djangoapp/mysite.
# PythonPath must be set to the PARENT directory (C:/djangoapp) so that
# the module name "mysite.settings" can be imported.
# Site URL: http://127.0.0.1:80/mysite/
    <Location "/mysite">
        SetHandler python-program
        PythonPath "['C:/djangoapp'] + sys.path"
        PythonHandler django.core.handlers.modpython
        SetEnv DJANGO_SETTINGS_MODULE mysite.settings
        PythonAutoReload On
        PythonOption django.root /mysite
        PythonDebug On
    </Location>

# Alias /site_media gives myproject's static files a URL alias.

#Alias /site_media c:/django/myproject/media
#    <Location "/site_media/">
#        SetHandler None
#    </Location>

# Map the /themes alias to the C:/djangoapp/mysite/themes directory.
# Use forward slashes in paths: Apache on Windows uses Unix-style names internally.
Alias /themes "C:/djangoapp/mysite/themes"
    <Location "/themes/">
        SetHandler None
    </Location>

# Alias /media gives the Django admin static files a URL alias.
Alias /media "C:/Python25/Lib/site-packages/django/contrib/admin/media"
    <Location "/media/">
        SetHandler None
    </Location>

# File types we want to serve statically (bypassing mod_python).
# Case-insensitive match.
    <LocationMatch "(?i)\.(jpg|gif|png|txt|ico|pdf|css|jpeg)$">
        SetHandler None
    </LocationMatch>
</VirtualHost>

# Disable caching so that edited Python source is reloaded automatically;
# remove this setting for a production release.
MaxRequestsPerChild 1

        以上是我的conf参数,仅供参考

        2、检查是否配置成功,修改urls.py:

from django.conf.urls.defaults import *
import os
# Uncomment the next two lines to enable the admin:
from django.contrib import admin
admin.autodiscover()

# Absolute location of the "themes" directory that sits next to this urls.py.
_THEMES_ROOT = os.path.dirname(os.path.abspath(__file__)) + '/themes/'

urlpatterns = patterns('',
    # Example:
    # (r'^mysite/', include('mysite.foo.urls')),

    # Uncomment the admin/doc line below and add 'django.contrib.admindocs'
    # to INSTALLED_APPS to enable admin documentation:
    # (r'^admin/doc/', include('django.contrib.admindocs.urls')),

    # The blog URLconf is included both at the site root and under blog/.
    (r'^', include('mysite.blog.urls')),
    # Admin site.
    (r'^admin/', include(admin.site.urls)),
    (r'^blog/', include('mysite.blog.urls')),
    # Theme files are served through Django's static file view.
    (r'^themes/(?P<path>.*)$', 'django.views.static.serve',
        {'document_root': _THEMES_ROOT}),
)

OK,在浏览器中输入 http://127.0.0.1/mysite/ 即可访问。