#!/usr/bin/python # -*- coding: UTF-8 -*- """Inspired by http://www.ducdigital.com/2009/11/24/massive-download-from-pixiv/ modified by Nandaka. http://nandaka.wordpress.com further modified by Kaens Bard http://kaen.su Works well in Python 2.6.x and 2.7.x Get and install Setup Tools: for Python 2.6.x: http://pypi.python.org/packages/2.6/s/setuptools/setuptools-0.6c11.win32-py2.6.exe Run cmd.exe, CHDIR to your python folder /Lib/site-packages and run easy_install.py beautifulsoup4 and easy_install.py http://wwwsearch.sourceforge.net/mechanize/src/mechanize-0.2.3.tar.gz Usage: set your account language to Japanese (for now their English translation seems shaky, will fix later and don't care about Chinese) EITHER: create a text file in UTF-8 w/o signature in scripts folder, list all the links you want mass-downloaded/updated there, each on its own line OR: just write the links directly in command line setup config.ini for startpage and quickcheck OR set quickcheck in command line (+q on, -q off) run the script with the file name (or many) as parameter(s) you can add links as parameters, they'll be processed The links accepted in the list file have been tested as working for: - user gallery pages - your own bookmark pages (make sure you've set up config.ini for your own profile for that) - tag searches - ranking pages - user stacc→user gallery (also accepts just the user ID as list name, checks for it are done after checks for list file name) - user profile → user gallery Additionally, it is possible to enumerate a user's or your own okiniiri list from the profile into a local file in list format. If you intend to use some Windows Notepad to create it, don't use unicode characters in links """ import re import os import sys import codecs import mechanize import urllib2 import time import random import calendar from datetime import date,datetime from mechanize import Browser,ProxyHandler from bs4 import BeautifulSoup import ConfigParser version = '2014-12-03' #-------Defaults url = 'http://www.pixiv.net/' proxyAddress = '' proxies = {} username = '' password = '' useragent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:16.0) Gecko/16.0 Firefox/16.0' debugHttp = False numberOfPage = 0 startPage = 1 quickCheck = True useRobots = False genericfilename_format = r"_%pageline%/%artist% (%artist_id%)/%image_id% - %title%" member_illust_format = r"(%artist_id%)/%image_id% - %title%" bookmark_new_illust_format = r"[お気に入りtracker]/%image_id% (%artist_id%) - %title%" response_format = r"%inurl_id% res/%image_id% (%artist_id%) - %title%" ranking_format = r"[%today%'s %inurl_id%'s best]/[#%num%] %image_id% (%artist_id%) - %title%" description_format = u"Title:%title%\\n%Tags:%tags%\\nCommentary:%remarks%" tokentag = r"" overwrite = False logFile = "" descript_ion = False description_file = u"descript.ion" blacklist = r"[腐 【腐 腐】 腐] 腐注 腐向 ※腐 BL注" #yaoi cut min_id = 0 #downto which id to longdownload, useful if you missed the 2000 latest okiniiri limit br = Browser() pr = None fQuickCheck = False curPage = 0 ugoira = False noOfImages = 0 debugid = 0 #-----------Start Download, input: image ID, type: VOID, save to disk def downloadImage(id): global br global fQuickCheck global curPage global ugoira global tokentag if quickCheck and fQuickCheck: return print "\nGetting file id: " + str(id) + "\t\t", tickity=0 #DEBUG while True: try: mediumPage = br.open(url+"member_illust.php?mode=medium&illust_id="+str(id),timeout=10) if logFile == "": print chr(8)+'`', else: print '`', ttt=mediumPage mediumPage = mediumPage.read() #time.sleep(random.random()*10+0.4) #i'm a human. human, not a robot. okay? right. if logFile == "": print chr(8)+'-', else: print '-', parseTitle = BeautifulSoup(mediumPage) if logFile == "": print chr(8)+'.', else: print '.', tickity += 1 if mediumPage.find(r"例外エラーが発生しました")>-1: print "...pixiv: unexpected error occurred, skipping..." raise elif mediumPage.find(r"該当作品は削除されたか、存在しない作品IDです。")>-1: print "...pixiv: submission taken down by artist" return elif mediumPage.find(r"マイピクにのみ公開されています")>-1: print "...pixiv: submission MyPixiv-only" return #DEBUG if tickity==20: print "-wrote tickity "+str(id)+" file-", lolfile=open('tickity '+str(id),'wb+') lolfile.write(parseTitle) lolfile.close() #/DEBUG break except: if id == debugid: print 1, else: print '.', tickity += 1 if tickity == 20: print "-wrote tickity "+str(id)+" file-", lolfile=open('tickity '+str(id),'wb+') lolfile.write(mediumPage) lolfile.close() time.sleep(5) anilookup = parseTitle.find('div',{'class':'_ugoku-illust-player-container'}) ugoira = anilookup != None if id == debugid: f=open("logofpages","wb+") f.write(parseTitle) f.close() #parse artist try: artist = parseTitle.find('div',{'class':'_unit profile-unit'}).h1.contents[0] print "artist: ",artist except UnicodeError: print "(not supported by console)" except LookupError,AttributeError: if parseTitle.find("エラーが発生しました".decode('utf8')): print "...denied by pixiv server, skipping" else: print "...Oops. Submission was taken down by the artist while downloading the rest OR another error occurred." f=open("submission %s down" % (id),"a+") f.write(mediumPage) f.close() return except: print "...artist parsing failed, SKIPPING" time.sleep(5) #parse commentary #print "Commentary:", #better not >_> try: artist_id = parseTitle.find('div',{'class':'_unit _work-detail-unit'}) .find('a',{'class':"tab-feed"})['href'] .split('/')[-1] except: print "artist_id not found:",parseTitle.find('div',{'class':'_unit _work-detail-unit'}) try: works_caption = parseTitle.find('p',{'class':'works_caption'}).getText(separator=u'\\n') except: works_caption=u'n/a' #parse tags, for blacklist as well tagsline=u'' #for descript.ion tagslist=[] #for log addtokentag=False #for the token tag try: for x in parseTitle.find('span',{'class':'tags-container'}).find_all(re.compile('a')): if not x.string: continue if x.string==u'*': tagsline+=x.string else: tagslist+=[x.string] tagsline+=x.string+u' ' if not addtokentag: addtokentag = (tokentag==x.string) except: print "...tag parsing failed, retrying" if tagsline==u'': tagsline=u'(n/a)' print "Tags:", for x in tagslist: try: print x, except UnicodeError: print "-", except LookupError: print "-", for i in blacklist.decode("utf8").split(" "): if (len(i) > 0) and (x.find(i) >= 0): try: print "blacklisted by %s, skipping..."%(i) except: print "blacklisted, skipping..." return print "" #parse imagedate and number of manga pages manga = False classmeta = parseTitle.find('ul',{'class':'meta'}) try: if classmeta != None: imagedate = classmeta.find(text=re.compile(r"(\d{4}年\d*月\d*日 \d{2}:\d{2}|\d*/\d*/\d{4} \d{2}:\d{2})".decode('utf8'))) else: print "WARNING: Either pixiv changed page format or page not found" z = re.search(r"(\d{4})年(\d*)月(\d*)日 (\d{2}):(\d{2})".decode('utf8'),imagedate) if z: # Japanese date imagedate = datetime(int(z.group(1)),int(z.group(2)),int(z.group(3)), int(z.group(4)),int(z.group(5))) else: imagedate = work_info.find('ul',{'class':'meta'}).find_all('li',text=re.compile(r"\d{2}/\d{2}/\d{2} \d{2}:\d{2}".decode('utf8')))[0].split("\n")[0] z = re.search(r"(\d*)/(\d*)/(\d{4}) (\d{2}):(\d{2})".decode('utf8'),imagedate) if z: # American date imagedate = datetime(int(z.group(3)),int(z.group(1)),int(z.group(2)), int(z.group(4)),int(z.group(5))) except Exception,e: print "(a)",str(e), print "(timestamp not found, assuming now/UTC)", imagedate = datetime.utcnow() if classmeta != None: mlookup = classmeta.find(text=re.compile(r"複数枚投稿 \d*P".decode('utf8'))) else: manga = False if ugoira: manga = False else: if mlookup != None: manga = True mangapages = int(mlookup.split(" ")[1][:-1]) else: manga = False imagedates = imagedate.strftime('%Y-%m-%d %H:%M') print "Date:",imagedates, #parse title try: # for a in parseTitle.find_all('h1',{'class':'title'}): print a.text title = parseTitle.find_all('h1',{'class':'title'})[2].text print "title:",title except UnicodeError: print "(not supported by console)" except LookupError: print "(unknown console encoding)" except: title = "untitled" #parse actual image(s) if ugoira: z = re.search(r"pixiv\.context\.ugokuIllustFullscreenData.+(http:.+ugoira1920x1080\.zip)",mediumPage) if z: anilink = z.group(1).replace('\\',"") else: print "Failed to find the Cinematic zip, skipping" exit works_display = parseTitle.find('div',{"class":"works_display"}) if not manga: tickity = 0 if not ugoira: while 1: try: viewPage = br.follow_link(url=works_display('a')[0]['href']) break except Exception,e: if id == debugid: print "(b)",str(e) else: print '.', tickity += 1 if tickity == 10: print "-wrote tickity "+str(id)+" file-", lolfile=open('tickity '+str(id),'wb+') lolfile.write(mediumPage) lolfile.close() if parseTitle.find("再度ログインしなおしてください".encode("utf8")): print "Attempting to re-login...", if login(username, password) == 0: print "success!" time.sleep(5) if manga: print "Getting manga,",mangapages,"pages..." imgList = [] for i in range(mangapages): imgList.append("member_illust.php?mode=manga_big&illust_id="+str(id)+"&page="+str(i)) elif ugoira: imgList = [anilink] else: #ergo, manga parser = BeautifulSoup(viewPage.read()) imgList = parser('img') for imgFile in imgList: #each imgFile becomes a big page link-to-follow in case of manga ._. if quickCheck and fQuickCheck: break if ugoira: ext = re.sub(r'http:.*/(\d+_ugoira.*\.zip)',r'\1',anilink) elif manga: while 1: try: viewPage = br.open(url+imgFile) parser = BeautifulSoup(viewPage.read()) imgFileM = parser('img')[0] ext = os.path.basename(imgFileM['src']) break except Exception,e: print "(c)",str(e), if str(e).find('global name')>-1: print "...skipping..." break if str(e).startswith('HTTP Error 404'): print "[404]The submission is rendered unloadable, skipping..." break if str(e).startswith('HTTP Error 400'): print "[400]The submission is rendered unloadable, skipping..." break time.sleep(5) else: #not manga or cine ext = os.path.basename(imgFile['src']) if ext.split('.')[0].startswith(str(id)): image_id = ext.split('.')[0] if manga: #for comfort browsing of manga stuff, zero-pad it to 2 digits: z = re.search(r'_p(\d*)',image_id) if z: image_id = re.sub(r'_p\d*','_p{0:02}'.format(int(z.group(1))),image_id) elif ugoira: image_id = re.sub(r'http://.*/(\d*)_ugoira.*',r'\1_ani',anilink) global _pager fileName = makeFilename(_pager, id, artist_id, artist, title, image_id, noOfImages, imagedates, addtokentag) fileName = fileName+"."+ext.split('.')[1].split('?')[0] print 'Saving to:', fileName = sanitizeFilename(fileName) try: print fileName except UnicodeError: print "(not supported by console)" except LookupError: print "(unknown console encoding)" if manga: dl(imgFileM['src'], fileName, viewPage.geturl(), imagedate) elif ugoira: dl(anilink, fileName, ttt.geturl(), imagedate) else: dl(imgFile['src'], fileName, viewPage.geturl(), imagedate) #descript.ion update tiem # assuming UTF-8 (so what if it doesn't work on most 2panel commanders, so what if it's not in the specs at http://jpsoft.com/ascii/descfile.txt? Who REALLY cares about this legacy? It just won't work otherwise, so there) # Using ^D the way Ghisler's Total Commadner 7.55a does--for description breaks, instead of what the specs say. # Likewise, EoF characters aren't specifically processed # Assuming available for writing # I hate everything about this implementation. if descript_ion: dfile = os.path.dirname(fileName)+'/'+description_file _descmagic='\x04\xC2\x0A' #writelines() will autoreplace \x0A with \x0D\x0A on w32 O_o if os.path.exists(dfile): descfile = open(dfile,'r') curdesc=descfile.readlines() descfile.close() else: curdesc=[] notyet=True for x in curdesc: if x.find(os.path.basename(fileName).encode('utf-8'))>-1: notyet=False break if notyet: print 'Updating descript.ion...' curdesc.append( makeDescription( os.path.basename(fileName), title, tagsline, works_caption ).encode('utf-8') + _descmagic ) descfile = open(dfile,'w') descfile.writelines(curdesc) descfile.close() curdesc=[] ttt.close() #-----------List all images #@profile def downloadAllImages(pager): global fQuickCheck global _pager _pager = pager print "Getting pages from ", pager fQuickCheck = False global startPage global curPage curPage = startPage hasMorePage = 1 global noOfImages noOfImages = 1 id = None previd = [0] weirdvar = 5 relogined = 0 while (hasMorePage != 0) and (weirdvar > 0): if quickCheck and fQuickCheck: break print "\nListing page #%d\t" % (curPage), while True: try: listPage = br.open(pager+"&p="+str(curPage),timeout=10) if logFile == "": print chr(8)+'`', else: print '`', ttt=listPage listPage=listPage.read() ttt.close() if logFile == "": print chr(8)+'-', else: print '-', #time.sleep(random.random()*10+2) #netiquette wwwhy is it here at all parseList = BeautifulSoup(listPage) if logFile == "": print chr(8)+'.', else: print '.', break except Exception, e: print "(d)",str(e) if str(e).find("403"): print "Attempting to re-login...", if login(username, password) == 0: print "success!" else: print '.', time.sleep(5) itsranking = False itsbookmarks = False illust_c = parseList.find('li',{'class':r'image-item'}) if illust_c == None: #bookmarks? illust_c = parseList.find('div', {'class':re.compile(r'^display_works')}) if illust_c == None: #if illust_c == None: # illust_c = parseList.find('ul', {'class':re.compile(r'image-items^')}) illust_c = parseList.find('div', {'class':re.compile('^ranking-items')}) if illust_c == None: #ranking? print "Unknown webpage design, ask the dev to support it" return else: itsranking = True print "Parsed as ranking" else: itsbookmarks = True print "Parsed as bookmarks" else: print "Parsed as generic" if illust_c != None: #found #if itsranking: illust = illust_c.find_all('a',href=re.compile(r'illust_id=\d*')) ##else: illust = parseList.find_all('a',{'class':re.compile('^work')}) for link in illust: if quickCheck and fQuickCheck: break try: id = link['href'].split('=')[2].split('&')[0].split('_')[0].split('?')[0] except IndexError: if link['href'].find('response.php'): continue print link['href'], 'has failed: unable to pick illust_id' if (id in previd) or (int(id) < 11): continue #skipping what's just in and incorrect id if int(id) < min_id: print "Lower id than minimum, stopping" break print "#"+ str(noOfImages)+':', downloadImage(id) previd.append(id) noOfImages = noOfImages + 1 hasMorePage = len(illust) curPage += 1 if numberOfPage == curPage: hasMorePage = 0 elif pager.find("bookmark_new_illust.php") and (curPage > 100): hasMorePage = 0 elif id != None: if int(id) < min_id: hasMorePage = 0 else: if relogined==0: print "\nNothing found on the page (div class * parsing error?), retrying.\n" weirdvar -= 1 time.sleep(5) if weirdvar==0: print "This is taking too long, attempting re-login..." configBrowser() if login(username, password) == 0: print "success!" relogined = 1 weirdvar = 5 else: print "That didn't help, skipping.\n" weirdvar = 0 print "Listing complete" previd = [0] re.purge() #-----------Download file def dl(url,filename,referer=None): dl(url,filename,referer,datetime.now()) def dl(url,filename,referer,imagedate): #circumventing some weirdass bug where we're downloading the same thing twice global prevname global fQuickCheck if quickCheck and fQuickCheck: return #url = re.sub(r'_p(\d*)\.',r'_big_p\1.',url) #add big_ to manga #OBSOLETED try: print "Downloading:", url, except LookupError: print "(unknown console encoding)" if os.path.exists(filename) and os.path.isfile(filename): if quickCheck: fQuickCheck = True print "\tFile exists, quick check-skipping the rest.\n" else: print "\tFile exists!\n" return print " " print "Trying to request ", req = urllib2.Request(url) if referer != None: req.add_header("Referer", referer) while True: try: res = br.open(req,timeout=10) break except Exception, e: if str(e).startswith('HTTP Error 404'): url = re.sub(r'_big_p(\d*)\.',r'_p\1.',url) #remove big_ from manga back >_> req = urllib2.Request(url) if referer != None: req.add_header("Referer", referer) print "\nURL change to",url while True: try: res = br.open(req,timeout=10) break except Exception, e: if str(e).startswith('HTTP Error 404'): print "Error 404 on fullsize, skipping the picture..." return else: print "(e)",str(e), print '.', time.sleep(5) else: print "(f)",str(e), print '.', time.sleep(5) dir = os.path.dirname(filename) if not os.path.exists(dir): try: print " Creating directory", dir, except UnicodeError: print "(not supported by console)", except LookupError: print "(unknown console encoding)", os.makedirs(dir) fretry = False fretrying = True save = open(filename, "w+b", 32768) while fretrying: try: prev = 0 if logFile == "": print '{0:10d} bytes'.format(prev), while 1: save.write(res.read(1024*256)) curr = save.tell() if logFile == "": print '\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b', print '{0:10d} bytes'.format(curr), if curr == prev: fretrying = False break prev = curr except IOError: if not fretry: fretry = True #save.seek(0,0) #save.truncate() save.flush() res = br.open(req,timeout=10) #print "and on...", else: print ".", time.sleep(5) save.close() if logFile != "": print '{0:10d} bytes'.format(curr), os.utime( filename, (calendar.timegm(imagedate.timetuple()), calendar.timegm(imagedate.timetuple())) ) print " done" #-------------produce an artist's okiniiri list.txt def dlOkiniiriList(pager): curpage=1 gotlist='' artistid='' print "Retrieving okiniiri list at:", pager, while True: req = urllib2.Request(pager+'&p=%d'%(curpage)) while True: try: res = br.open(req,timeout=10) break except Exception, e: if str(e).startswith('HTTP Error 404'): print "Not found, skipping." return else: print '.', time.sleep(5) while True: try: parseList = BeautifulSoup(res.read()) break except: time.sleep(5) print ',', if artistid=='': artistid = re.search( r'^http://i\d*\.pixiv\.net/img\d*/profile/([^/]*)/.*', parseList.find('div',{'class':'profile_area'}).find('img')['src'] ).group(1) print '\nUser='+artistid, 'Pages:', list_person = parseList.find_all('div',{'class':'usericon'}) for au in list_person: z = re.search( r'^http://i\d*\.pixiv\.net/img\d*/profile/([^/]*)/(mobile/)?.*\..*', au.find('img')['src'] ) if z: tehid = z.group(1) else: print "~", #extracting stacc name from artist's page q=urllib2.Request(url+au.find('a')['href']) try: a=br.open(q,timeout=10) p=BeautifulSoup(a.read()) p=p.find('div',{'class':'extaraNavi'}).find('a',href=re.compile(r'.*net/stacc/.*'))['href'] tehid=p.split('/')[-1] #print tehid, except: print "?", #failed to extract. bah, forget all this omfgletsfinditnomatterwhat shit, use full link tehid=url+'member_illust.php?id='+au.find('a')['href'].split('=')[1] gotlist+= tehid+'\x0A' if list_person: print curpage, curpage+=1 else: filename = 'Okiniiri of %s.txt'%(artistid) print "\nSaving the list as",filename open(filename,'w+').writelines(gotlist.encode('utf8')) return #-------------process list.txt def processList(filename): def ___commonpart___(pager): if re.search('^[a-zA-Z0-9-_]*$',pager): pager = "http://pixiv.me/"+pager if not re.search(r'^(http://)*([w\.]*)pixiv\.(net|cc)',pager): print "Not a pixiv address! Skipping..." else: pager = re.sub(r'member\.php',r'member_illust.php',pager) pager = re.sub(r'([?&]page=\d*|[?&]p=\d*|[?&]num=\d*)','',pager) if re.search(r'/stacc/',pager): print "Converting a stacc address...", try: ww = br.open(pager) www = BeautifulSoup(ww.read()) wwww = www.find('a',{'title':u'作品'}) pager = wwww['href'] print "success!" except Exception,e: print "(g)",str(e),": skipping..." return if pager == "http://www.pixiv.net/bookmark_new_illust.php": pager = "http://www.pixiv.net/bookmark_new_illust.php?mode=new" if re.search(r'bookmark\.php.*type=user',pager): dlOkiniiriList(pager) else: downloadAllImages(pager) if filename[:7]=='http://': print "Downloading from:", filename else: print "Processing list from:", filename if filename[:7]=='http://': ___commonpart___(filename) elif os.path.exists(filename) and os.path.isfile(filename): reader = open(filename,'r') for line in reader: pager = line.replace(chr(10),"").replace(chr(13),"") if pager.startswith('#'): continue ___commonpart___(pager) else: print "File not found." #-------------Sanitize filename (windows, but / counts as \) badchars= re.compile(r'['+chr(01)+'-'+chr(31)+']|^\.|\.$|^ | $|^$|\?|:|<|>|\||\*|\"') badnames= re.compile(r'(aux|com[1-9]|con|lpt[1-9]|prn)(\.|$)') def sanitizeFilename(s): name= badchars.sub('_', s) if badnames.match(name): name= '_'+name return name #------------Main Block def main(): try: prepare() global logFile if logFile != "": print "Logging output to "+logFile sys.stdout = codecs.open(logFile,'a+',encoding="utf-8-sig") global version print "Pixiv Mass Downloader ver." + version if logFile != "": print "By Duc Digital; Nandaka; Kaens Bard" global username if username == None or username == "": if logFile != "": print "Cannot log in from stdin while file-logging, terminating" else: username = raw_input("Username = ") else: print "Login as: "+username global password if password == None or password == "": if logFile != "": print "Cannot log in from stdin while file-logging, terminating" else: password = raw_input("Password = ") global numberOfPage if numberOfPage != 0: print "Page processing limit = ", numberOfPage global overwrite global quickCheck if overwrite: if not quickCheck: print "Overwrite mode" else: print "Overwrite mode--overridden by Quick check mode" overwrite = False if len(sys.argv) == 0: print "\nUTC", str(datetime.utcnow()), "Empty command line! Nothing to process." else: if login(username, password) == 0: for arg in sys.argv[1:]: if arg=='-q': print "\nQuick check mode now disabled" quickCheck = False elif arg=='+q': print "\nQuick check mode now enabled, overriding overwrite mode" quickCheck = True overwrite = False elif arg[:7]=='http://': processList(arg) elif os.path.exists(arg) and os.path.isfile(arg): processList(arg) else: if re.search('^[a-zA-Z0-9-_]*$',arg): print "Checking for a stacc address...", try: #ww = br.open(url+'stacc/'+arg) ww = br.open("http://pixiv.me/"+arg) www = BeautifulSoup(ww.read()) #open(arg+'-dump','w+').writelines(www.encode('utf8')) wwww = www.find("a",{'class':'tab-works'}) print 'success!' processList(url+wwww['href']) except Exception,e: if str(e).startswith('HTTP Error 404'): print 'failed, skipping...' else: print "(h)",str(e) if not quickCheck and (logFile==""): print "UTC", str(datetime.utcnow()), "All done! Press Enter to exit." raw_input() else: print "UTC", str(datetime.utcnow()), "Quick check complete." else: print "UTC", str(datetime.utcnow()), "Failed to log in." except KeyboardInterrupt as ex: print 'CTRL+C, aborted' #-------load config def loadConfig(): config = ConfigParser.RawConfigParser() try: config.read('pixivUtil.ini') print "Reading values for", global username print "username", username = config.get('Authentication','username') global password print "password", password = config.get('Authentication','password') global proxyAddress global proxies print "proxy_address", proxyAddress = config.get('Settings','proxy_address') if proxyAddress: try: proxies = {'http':proxyAddress} except: print "(couldn't parse proxy line)", proxies = {} global useragent print "user_agent", useragent = config.get('Settings','user_agent') global numberOfPage print "number_of_page", numberOfPage = config.getint('Pixiv','number_of_page') global startPage print "start_page", startPage = config.getint('Pixiv','start_page') global quickCheck print "quickcheck", quickCheck = config.getboolean('Pixiv','quickcheck') global genericfilename_format print "genericfilename_format", genericfilename_format = config.get('Pixiv','genericfilename_format').decode('utf8') global member_illust_format print "member_illust_format", member_illust_format = config.get('Pixiv','member_illust_format').decode('utf8') global bookmark_new_illust_format print "bookmark_new_illust_format", bookmark_new_illust_format = config.get('Pixiv','bookmark_new_illust_format').decode('utf8') global response_format print "response_format", response_format = config.get('Pixiv','response_format').decode('utf8') global ranking_format print "ranking_format", ranking_format = config.get('Pixiv','ranking_format').decode('utf8') global description_format print "description_format", description_format = config.get('Pixiv','description_format').decode('utf8') global tokentag print "tokentag", tokentag = config.get('Pixiv','tokentag').decode('utf8') global blacklist print "blacklist", blacklist = config.get('Pixiv','blacklist') global logFile print "logfile", logFile = config.get('Settings','logfile').decode('utf8') global descript_ion print "descript.ion", descript_ion = config.getboolean('Settings','descript.ion') global description_file print "descript.ion_file", descript_ion = config.get('Settings','descript.ion_file').decode('utf8') global debugHttp print "debug_http", debugHttp = config.getboolean('Settings','debug_http') global useRobots print "use_robots." useRobots = config.getboolean('Settings','use_robots') except ConfigParser.NoOptionError: print "Required option not found in config, writing defaults..." writeConfig() exit() except ConfigParser.NoSectionError: print "Required section not found in config, writing defaults..." writeConfig() exit() #-------write config def writeConfig(): print "Writing defaults is temporarily disabled, please add the missing option/section manually." return config = ConfigParser.RawConfigParser() config.add_section('Settings') config.add_section('Pixiv') config.add_section('Authentication') config.set('Authentication', 'username', username) config.set('Authentication', 'password', password) config.set('Pixiv', 'number_of_page', numberOfPage) config.set('Pixiv', 'start_page', startPage) config.set('Pixiv', 'quickcheck', quickCheck) config.set('Pixiv', 'blacklist', blacklist) config.set('Pixiv', 'genericfilename_format', genericfilename_format) config.set('Pixiv', 'member_illust_format', member_illust_format) config.set('Pixiv', 'bookmark_new_illust_format', bookmark_new_illust_format) config.set('Pixiv', 'response_format', response_format) config.set('Pixiv', 'ranking_format', ranking_format) config.set('Pixiv', 'descript.ion_format',description_format) config.set('Pixiv', 'tokentag',tokentag) config.set('Settings', 'proxy_address', proxyAddress) config.set('Settings', 'user_agent', useragent) config.set('Settings', 'debug_http', debugHttp) config.set('Settings', 'use_robots', useRobots) config.set('Settings', 'logfile', logFile) config.set('Settings','descript.ion',descript_ion) config.set('Settings','descript.ion_file',description_file) with open('pixivUtil.ini', 'wb') as configfile: #utf-8 dammit config.write(configfile) print "Configuration file saved." #-------construct the filename def makeFilename(pageline, member_id, artist_id, artist, title, image_id, num, imgdate, addtokentag): global tokentag image_id = str(image_id) if (tokentag != "") and addtokentag: image_id = image_id+r"["+tokentag+r"]" inurl_id = ' ' q = urllib2.unquote(pageline).decode('utf8').split('/')[-1] #specific format checks z = re.search(r'member_illust\.php\?id=(\d*)',q) if z: inurl_id = z.group(1) nameformat = member_illust_format else: z = re.search(r'bookmark_new_illust\.php',q) if z: nameformat = bookmark_new_illust_format else: z = re.search(r'response\.php\?illust_id=(\d*)',q) if z: inurl_id = z.group(1) nameformat = response_format else: z = re.search(r'ranking.*(mode=([a-z0-9]*)|rookie)',q) if z: inurl_id = z.group(1) nameformat = ranking_format else: nameformat = genericfilename_format nameformat = nameformat.replace('%pageline%',q.replace(u'?',u'?'))\ .replace('%artist%',artist.replace(u'\\',u'_').replace(u'/',u'_'))\ .replace('%title%',title.replace(u'\\',u'_').replace(u'/',u'_'))\ .replace('%image_id%',image_id).replace('_big','')\ .replace('%member_id%',str(member_id))\ .replace('%artist_id%',artist_id)\ .replace('%inurl_id%',inurl_id)\ .replace('%today%',str(date.today()))\ .replace('%date%',imgdate.replace(u'\\',u'-')).replace(u'//',u'-')\ .replace('%num%','{0:03d}'.format(num)) #leaving at 3 because sorting on it is only important for ranking, and that's 500 max return nameformat #-------Construct the line of descript.ion def makeDescription(file, title, tags, remarks): _file = file if _file.find(' ')>-1: _file = '"'+_file+'"' _file+=' ' return _file + description_format\ .replace(u'%title%',title)\ .replace(u'%tags%',tags)\ .replace(u'%remarks%',remarks) #-------Configure browser object def configBrowser(): global br global pr global proxies if proxies: pr = ProxyHandler(proxies) else: pr = ProxyHandler() br.set_handle_equiv(True) #br.set_handle_gzip(True) br.set_handle_redirect(True) br.set_handle_referer(True) global useRobots br.set_handle_robots(useRobots) global debugHttp br.set_debug_http(debugHttp) br.visit_response global useragent br.addheaders = [('User-agent', useragent)] #-------Login to pixiv def login(username, password): print "Login at %s\t\t" % (url), req = urllib2.Request(url+'login.php') while True: try: response = br.open(req,timeout=10) break except Exception,e: print "(i)",str(e),':: sleeping for 5 sec' time.sleep(5) rd = response.read() global debugHttp if debugHttp: print rd try: form = br.select_form(nr=1) br['pixiv_id'] = username br['pass'] = password response = br.submit() # LOGIN except Exception,e: print "(j)",str(e),"(assuming relogin)" #print "(login form not found, assuming relogin)", lolfile=open('loginness.log','wb+') lolfile.write(rd) lolfile.close() if response.geturl() == 'http://www.pixiv.net/mypage.php': print "DONE!" return 0 else : print 'Wrong username or password' lolfile=open('loginness.log','wb+') lolfile.write(response.read()) lolfile.close() return 1 def prepare(): loadConfig() configBrowser() def printConfig(): print "Username :",username print "Password :",password print "Proxy Addr:",proxyAddress if __name__ == "__main__": main()