
2009年5月19日
我是一个伪动漫爱好者,会一点点日语。当时发现了某Id有大量的日语小说扫描图,于是写一脚本把它们全部抓下来。一共25G,最后挑了7.xG。单线程下载。最近想完善blog图片的下载,发现163相册已经改了,脚本暂时无法使用。而那个牛Id也被删了,-_-b……
1 #!/usr/bin/python
2 # Filename: down163.py
3
4 import re
5 import urllib
6 import os
7 import os.path
class Dl163Photo:
    """Download the pictures of one photo.163.com album, or of every album of a user.

    The script targets Python 2: it relies on ``raw_input``, ``unicode`` and
    ``urllib.urlopen``.  ``print`` is written in call form with a single
    argument so that the statements parse under both Python 2 and Python 3.

    Pictures are written to ``./<user id>/<album title>/<counter>.<extension>``.
    """

    # URL of a single album; captures (user id, album id).
    _ALBUM_URL = r"http://photo\.163\.com/photos/(\w+)/(\d+)"
    # Any URL below a user's photo root; captures the user id.
    _USER_URL = r"http://photo\.163\.com/photos/(\w+)"
    # URL of a user's album overview (nothing after the user id); captures the user id.
    _USER_HOME_URL = r"http://photo\.163\.com/photos/(\w+)\/?$"

    def __init__(self):
        # "%SID%" is substituted with the second captured field of each photo entry.
        self._photoDomain = "http://img%SID%.photo.163.com"
        # Captures, in order: two photo types, two scale prefixes and the
        # 'sid' / 'ext' index digits from the global constants script.
        self._globalExpress = r"""var PHOTO_TYPES = \["(\w+)", "(\w+)"\].+var PHOTO_SCALE_PREFIXES = \["(\w*)", "(\w+)"\];.+var PHOTOSINFO_VALUE_INDEX = {'sid' : (\d), 'ext' : (\d)"""
        # Not used by any method of this class; kept for compatibility.
        self._userExpress = r'<script type="text\/javascript" src="\/js\/photosinfo.php\?user=(\w+)&aid=(\d+)&from=\w+"><\/script>'
        # Captures (index, first array field) of every gPhotosInfo entry.
        self._photoExpress = r"gPhotosInfo\[(\d+)\] = \[(\d+),"
        # Captures the album title.
        self._titleExpress = r"""var gAlbumInfo = {.+'title':"(.+) ",'descr'"""
        # Captures the index of every gAlbumsInfo entry.
        self._albumExpress = r"gAlbumsInfo\[(\d+)\] = "
        # Filled in by Input(), or lazily by ProcessOne()/ProcessAll() when
        # those are called without going through Input() first.
        self._root = None
        self._globalJs = None

    def _SetRoot(self, url):
        """Derive the site root and the global-constants script URL from ``url``.

        Assumes ``url`` starts with ``http://`` (7 characters); everything up
        to the next ``/`` is taken as the host.
        """
        self._root = "http://" + url[7:].split('/')[0]
        self._globalJs = self._root + "/js/global.consts.js"

    def GetContent(self, url):
        """Fetch ``url``, remember the body in ``self._content`` and return it."""
        handle = urllib.urlopen(url)
        try:
            self._content = handle.read()
        finally:
            # Release the connection even when read() fails.
            handle.close()
        return self._content

    def Run(self):
        """Ask for a URL and download either that album or all of the user's albums."""
        url = self.Input()
        if Regex().Search(self._ALBUM_URL, url):
            self.ProcessOne(url)
        else:
            self.ProcessAll(url)

    def ProcessAll(self, url):
        """Download every album listed for the user named in ``url``.

        Raises IndexError when ``url`` does not contain a user id.
        """
        if self._root is None:
            self._SetRoot(url)
        r = Regex()
        userId = r.FindAll(self._USER_URL, url)[0]
        albumsUrl = "%s/js/albumsinfo.php?user=%s&from=%s" % (self._root, userId, "guest")
        albums = r.FindAll(self._albumExpress, self.GetContent(albumsUrl))
        print("%d album(s) found in the %s user" % (len(albums), userId))
        for item in albums:
            self.ProcessOne("http://photo.163.com/photos/%s/%s/" % (userId, unicode(item, "raw-unicode-escape")))

    def ProcessOne(self, url):
        """Download every picture of the album addressed by ``url``.

        Raises IndexError when ``url`` is not an album URL or when one of the
        fetched scripts does not match the expected patterns.
        """
        if self._root is None:
            self._SetRoot(url)
        r = Regex()
        userId, aid = r.FindAll(self._ALBUM_URL, url)[0]

        consts = r.FindAll(self._globalExpress, self.GetContent(self._globalJs))[0]
        photoTypes = consts[:2]
        photoInfoIndex = consts[4:6]
        # The extension is the same for every picture, so look it up once.
        extension = photoTypes[int(photoInfoIndex[1]) - 1]

        photoInfoUrl = "%s/js/photosinfo.php?user=%s&aid=%s&from=%s" % (self._root, userId, aid, "guest")
        photoInfo = self.GetContent(photoInfoUrl)
        photos = r.FindAll(self._photoExpress, photoInfo)
        albumTitle = unicode(r.FindAll(self._titleExpress, photoInfo)[0], "raw-unicode-escape")
        print("downloading %s's %s from 163 photo" % (userId, albumTitle))
        total = len(photos)
        print("found %s pictures in the album that you want." % total)

        directory = "./%s/%s" % (userId, albumTitle)
        if not os.path.exists(directory):
            # Creates ./<user id> as well when it is missing.
            os.makedirs(directory)

        for counter, item in enumerate(photos):
            # item is (entry index, first array field); the latter selects the image host.
            picUrl = "%s/%s/%s/%s.%s" % (self._photoDomain.replace("%SID%", item[1]), userId, aid, item[0], extension)
            # File names are zero-based and zero-padded to four digits.
            filename = "%s/%0.4d.%s" % (directory, counter, extension)
            self.Save(filename, self.GetContent(picUrl))
            print("downloaded from %s progress:%d/%d" % (picUrl, counter + 1, total))

    def Save(self, path, content):
        """Write ``content`` to ``path`` in binary mode, replacing an existing file."""
        out = open(path, "w+b")
        try:
            out.write(content)
        finally:
            # Close the file even when the write fails.
            out.close()

    def Input(self):
        """Prompt until an album or user URL is entered, then return it.

        Also records the site root derived from the accepted URL.
        """
        r = Regex()
        msg = "input the full path of 163 photo album's category homepage.\n such as \nhttp://photo.163.com/photos/ryvaius/61670282\nor:\nhttp://photo.163.com/photos/ryvaius/\n\nurl:\n"
        url = raw_input(msg)
        while r.Search(self._ALBUM_URL, url) is None and r.Search(self._USER_HOME_URL, url) is None:
            url = raw_input(msg)
        self._SetRoot(url)
        print("parse the pages\n")
        return url
91
class Regex:
    """Small convenience wrapper around the ``re`` module.

    The methods keep no state, so they are static: both ``Regex().FindAll(...)``
    and ``Regex.FindAll(...)`` work.
    """

    @staticmethod
    def FindAll(express, content):
        """Return ``re.findall`` results for ``express`` in ``content``.

        The pattern is compiled with ``re.DOTALL``, so ``.`` also matches newlines.
        """
        return re.compile(express, re.DOTALL).findall(content)

    @staticmethod
    def Replace(oldExpress, new, content):
        """Return ``content`` with every match of ``oldExpress`` replaced by ``new``.

        ``new`` is passed to ``re.sub`` unchanged, so backslash escapes and
        group references in it are interpreted.
        """
        return re.compile(oldExpress).sub(new, content)

    @staticmethod
    def Search(express, content):
        """Return the first match of ``express`` in ``content``, or None.

        Unlike FindAll, the pattern is compiled without ``re.DOTALL``.
        """
        return re.compile(express).search(content)
105
if __name__ == "__main__":
    # Start the interactive download only when the file is executed as a
    # script, not when it is imported as a module.
    Dl163Photo().Run()
posted @
2009-05-19 11:24 毒菇兄 阅读(50) |
评论 (0) |
编辑

2009年5月2日
.net Linq做得不错,很爽,尤其是Lambda表达式。其实.net 2.0的时代,早就有类似的做法,不过算是取巧。
但Java确实是不能了,因为Java到现在还认为重载操作符不安全,不能重载。Lambda表达式: list.Where(o=>o.???==????);而Nbear有ExpressionClip:Gateway.Default.From<Product>().Where(Product._.Id==10);Product._.Id==10返回其实是一个WhereClip
看一下代码,很简单。
1 using System;
2 using System.Collections.Generic;
3 using System.Linq;
4 using System.Text;
5 namespace optest {
6
/// <summary>
/// Console demo for the arithmetic operators overloaded on <see cref="Cart"/>.
/// </summary>
class Program {
    /// <summary>
    /// Builds a cart, applies the overloaded "-" operator to it and prints
    /// both the operand and the result.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args) {
        var original = new Cart { Price = 100 };

        // Note: the expression evaluates to a Cart instance, not to a number.
        var reduced = original - 10;

        Console.WriteLine(original);
        Console.WriteLine(reduced);

        // Blocks until a line has been read from standard input.
        Console.ReadLine();
    }
}
16
/// <summary>
/// A cart with a single price that supports "cart - amount" and "cart + amount".
/// </summary>
public class Cart {
    /// <summary>Current price of the cart.</summary>
    public int Price { get; set; }

    /// <summary>
    /// Returns a new cart whose price is <paramref name="cart"/>'s price minus
    /// <paramref name="price"/>. The operand itself is left unchanged.
    /// (Any other operator, for example ==, could be overloaded the same way.)
    /// </summary>
    /// <param name="cart">The cart to start from.</param>
    /// <param name="price">The amount to subtract.</param>
    /// <returns>A new <see cref="Cart"/> instance carrying the reduced price.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="cart"/> is null.</exception>
    public static Cart operator -(Cart cart, int price) {
        // ReferenceEquals keeps the check valid even if == is overloaded later.
        if (object.ReferenceEquals(cart, null)) {
            throw new ArgumentNullException("cart");
        }
        // A binary operator must not modify its operands, so build a new cart.
        return new Cart { Price = cart.Price - price };
    }

    /// <summary>
    /// Returns a new cart whose price is <paramref name="cart"/>'s price plus
    /// <paramref name="price"/>. The operand itself is left unchanged.
    /// </summary>
    /// <param name="cart">The cart to start from.</param>
    /// <param name="price">The amount to add.</param>
    /// <returns>A new <see cref="Cart"/> instance carrying the increased price.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="cart"/> is null.</exception>
    public static Cart operator +(Cart cart, int price) {
        if (object.ReferenceEquals(cart, null)) {
            throw new ArgumentNullException("cart");
        }
        return new Cart { Price = cart.Price + price };
    }

    /// <summary>Returns a two-line description containing the price.</summary>
    public override string ToString() {
        return string.Format("Cart-----\r\nPrice:{0}", Price);
    }
}
posted @
2009-05-02 23:26 毒菇兄 阅读(235) |
评论 (1) |
编辑
Discuz!NT的模板技术似乎一直被别人称赞,不过当时也知道它是靠实现HttpModule(IHttpModule接口)做到的。最近思考MVC的问题,忽然想起DZ,于是简单看了一下微软的资料。如此简单!
MyHttpModule.cs
1 using System;
2 using System.Collections.Generic;
3 using System.Web;
4
5 namespace rewritetest {
6
/// <summary>
/// HTTP module that rewrites the request path according to the "tpl" query
/// string value and appends a fixed text to every response.
/// </summary>
public class MyHttpModule:System.Web.IHttpModule {

    /// <summary>
    /// Subscribes this module's handlers to the application's BeginRequest
    /// and EndRequest events.
    /// </summary>
    /// <param name="context">The application whose events are hooked.</param>
    public void Init(System.Web.HttpApplication context) {
        context.BeginRequest += BeginRequest;
        context.EndRequest += EndRequest;
    }

    /// <summary>Nothing to release.</summary>
    public void Dispose() {
    }

    /// <summary>
    /// Rewrites the request to dir1/test.htm when tpl is "1" and to
    /// dir2/test.htm when tpl is "2"; any other value, or none, leaves the
    /// request untouched.
    /// </summary>
    /// <param name="sender">The <see cref="HttpApplication"/> raising the event.</param>
    /// <param name="e">Not used.</param>
    public void BeginRequest(object sender, EventArgs e) {
        // Direct cast: an unexpected sender fails with InvalidCastException
        // here rather than with a NullReferenceException on the next access.
        HttpContext context = ((HttpApplication)sender).Context;

        // Read the query value once; a missing or empty value matches no case.
        string template = context.Request.QueryString["tpl"];
        switch (template) {
            // NOTE(review): both targets are relative paths - confirm how they
            // resolve for requests that are not at the application root.
            case "1":
                context.RewritePath("dir1/test.htm");
                break;
            case "2":
                context.RewritePath("dir2/test.htm");
                break;
        }
    }

    /// <summary>Writes a fixed text to the response of the finished request.</summary>
    /// <param name="sender">The <see cref="HttpApplication"/> raising the event.</param>
    /// <param name="e">Not used.</param>
    public void EndRequest(object sender, EventArgs e) {
        HttpContext context = ((HttpApplication)sender).Context;
        context.Response.Write("附加信息");
    }
}
33 }
34
35
在web.config的system.web标签里加上
<httpModules>
<add name="HttpModule" type="rewritetest.MyHttpModule,rewritetest"/>
</httpModules>
dir1/test.htm 和 dir2/test.htm 自己随便写一个吧。从页面末尾多出来的“附加信息”这几个字,也可以看出些东西了。因为我们能获取HttpContext,所以HttpModule能做的事情很多很多,包括注册、防盗链等等。不过模块是在httpModules里加载的,每个请求都会经过它,所以服务器的压力会大一点。
如果想有更多的了解,请移步
http://msdn.microsoft.com/zh-cn/library/system.web.httpapplication(VS.80).aspx
posted @
2009-05-02 22:54 毒菇兄 阅读(74) |
评论 (0) |
编辑