--- /dev/null
+# coding=utf-8
+from paste.script.command import Command
+from fc.lib.base import *
+from fc.model import *
+from sqlalchemy.orm import eagerload
+from sqlalchemy.orm import class_mapper
+from sqlalchemy.sql import and_, or_, not_
+import sqlalchemy
+import os
+import cgi
+import shutil
+import datetime
+import time
+import Image
+import hashlib
+import re
+from fc.lib.fuser import FUser
+from fc.lib.miscUtils import *
+from fc.lib.constantValues import *
+from fc.lib.settings import *
+from fc.lib.fileHolder import AngryFileHolder
+import sys
+import paste.fixture
+import paste.registry
+import paste.deploy.config
+from paste.deploy import loadapp, appconfig
+from paste.script.command import Command, BadCommand
+from paste.script.filemaker import FileOp
+from paste.script.pluginlib import find_egg_info_dir
+import urllib2
+import httplib
+from lxml import etree
+import StringIO
+from fc.model.arch import *
+import logging
+
+def can_import(name):
+    """Attempt to __import__ the specified package/module, returning True when
+    succeeding, otherwise False"""
+    # NOTE: a successful probe leaves the module loaded in sys.modules.
+    try:
+        __import__(name)
+        return True
+    except ImportError:
+        return False
+
+def unicodify(text):
+    # Decode Python 2 byte strings as UTF-8; unicode (or None) passes through
+    # unchanged.
+    if isinstance(text, str):
+        text = text.decode('utf-8')
+    return text
+
+# Process-wide cache: source-site post id -> [local DB post id, source thread id]
+# (populated in UpdateArchive.processPost, consulted by IBParser.ResolveSecondaryId).
+idList = {}
+# Registry of post filters by CLI name; filled with 'sage'/'lowres' in
+# UpdateArchive.command.
+GFilters = {}
+
+class DateTimeParser:
+    # Month lookup table: each triple is (Russian abbrev, English abbrev,
+    # Russian genitive form); the 1-based position gives the month number.
+    monthes = [('Янв','Jan','января'),('Фев','Feb','февраля'),('Мар','Mar','марта'),('Апр','Apr','апреля'),('Май','May','мая'),('Июн','Jun','июня'),('Июл','Jul','июля'),('Авг','Aug','августа'),('Сен','Sep','сентября'),('Окт','Oct','октября'),('Ноя','Nov','ноября'),('Дек','Dec','декабря')]
+    # Matches "<day> <month-name> <year> <h>:<m>:<s>" after an arbitrary prefix.
+    dateRe = re.compile(r"""[^\d]+(\d+)\s+([^\d\s]+)\s+(\d+)\s+(\d+)\:(\d+)\:(\d+)""")
+    # Matches "YYYY-MM-DD HH:MM:SS".
+    dateReISO = re.compile(r"""(\d+)\-(\d+)\-(\d+) (\d+)\:(\d+)\:(\d+)""")
+    def getDateTime(self,date):
+        # Parse a board-style date string into a datetime; returns None when the
+        # month name is not recognized.
+        # NOTE(review): raises IndexError when the string does not match dateRe.
+        dateP = self.dateRe.findall(date)
+        dateP = dateP[0]
+        mi = 0
+        f = False
+        # Find the 1-based month number whose name triple contains the token.
+        for mm in self.monthes:
+            mi = mi + 1
+            if dateP[1] in mm:
+                f = True
+                break
+        if f:
+            return datetime.datetime(int(dateP[2]),mi,int(dateP[0]),int(dateP[3]),int(dateP[4]),int(dateP[5]))
+        else:
+            return None
+    def getDateTimeFromISO8601(self,date):
+        # Parse "YYYY-MM-DD HH:MM:SS" into a datetime.
+        # NOTE(review): raises IndexError when nothing in the string matches.
+        dateP = self.dateReISO.findall(date)
+        dateP = dateP[0]
+        return datetime.datetime(int(dateP[0]),int(dateP[1]),int(dateP[2]),int(dateP[3]),int(dateP[4]),int(dateP[5]))
+
+DTP = DateTimeParser()
+
+class IBParser:
+    # Base class for imageboard HTML parsers. Concrete engines (e.g.
+    # WakabaParser) are expected to supply the replyIdRe/postIdRe regexes that
+    # GetPostID relies on.
+    def GetNextTag(self,el,tag,skip=0):
+        # Return the first following sibling of `el` (or `el` itself when `skip`
+        # is falsy) whose tag equals `tag` case-insensitively, else None.
+        tag = tag.lower()
+        if skip:
+            r = el.getnext()
+        else:
+            r = el
+        if not r.tag or r.tag.lower() != tag:
+            # Walk forward until the next sibling matches (or siblings run out).
+            while (r.getnext() != None) and not (r.getnext().tag and r.getnext().tag.lower() == tag):
+                r = r.getnext()
+            if r.getnext() != None:
+                r = r.getnext()
+        if r.tag and r.tag.lower() == tag:
+            return r
+        else:
+            return None
+
+    def GetPreviousTag(self,el,tag,skip=0):
+        # Mirror of GetNextTag scanning the preceding siblings instead.
+        tag = tag.lower()
+        if skip:
+            r = el.getprevious()
+        else:
+            r = el
+        if not r.tag or r.tag.lower() != tag:
+            while (r.getprevious() != None) and not (r.getprevious().tag and r.getprevious().tag.lower() == tag):
+                r = r.getprevious()
+            if r.getprevious() != None:
+                r = r.getprevious()
+        if r.tag and r.tag.lower() == tag:
+            return r
+        else:
+            return None
+    def ResolveSecondaryId(self,thread,Ids):
+        # Map source-site ids Ids == [thread id, post id] to the local DB
+        # Post.id, or None when the target is not archived.
+        # Fast path: the in-process idList cache (keyed by source post id only).
+        id = int(Ids[1])
+        if id in idList:
+            return idList[id][0]
+
+        # Slow path: locate the thread's opening post (parentid == -1, tagged
+        # with both the chan tag and the board), then the reply by its
+        # secondaryIndex when a distinct post id was requested.
+        tagsf = and_(Post.tags.any(tag=thread.chanTag),Post.tags.any(tag=thread.board))
+        f2 = and_(Post.parentid==-1,tagsf)
+        f1 = and_(Post.secondaryIndex==Ids[0],f2)
+        thread = meta.Session.query(Post).filter(f1).first()
+        if thread:
+            if Ids[0] == Ids[1]:
+                return thread.id
+            else:
+                post = meta.Session.query(Post).filter(and_(Post.secondaryIndex==int(Ids[1]),Post.parentid==thread.id)).first()
+                if post:
+                    return post.id
+                else:
+                    return None
+        else:
+            return None
+    def GetPostID(self,post):
+        # Return [source thread id, source post id] extracted from post.href.
+        # A post that already knows its Thread uses the thread's tid plus the
+        # reply anchor; otherwise both ids come from the href itself.
+        if post.thread:
+            ids = self.replyIdRe.findall(post.href)
+            return [post.thread.tid,int(ids[0])]
+        else:
+            ids = self.postIdRe.findall(post.href)
+            return [int(ids[0][0]),ids[0][2] and int(ids[0][2]) or int(ids[0][0])]
+
+class Loader:
+    # Base loader; only provides URL/path decomposition shared by subclasses.
+    def parseLink(self,link):
+        # Split a URL (or plain path) into:
+        # [protocol or None, host, base url 'proto://host/',
+        #  directory url (everything up to and including the last '/'),
+        #  file name, absolute path starting with '/'].
+        s1 = link.split('://')
+        p = len(s1)>1 and s1[0] or None
+        p2= p and (p+'://') or ''
+        s2 = s1[-1].split('/')
+        return [p, s2[0], p2 + s2[0] + '/', p2 + '/'.join(s2[:-1]) + '/', s2[-1],'/'+'/'.join(s2[1:])]
+
+class LoaderLocal(Loader):
+    # Loader for threads mirrored on the local filesystem.
+    def __init__(self,link):
+        p = self.parseLink(link)
+        # Directory of the thread page; relative fetches resolve against it.
+        self.relativeUrl = p[3]
+    def stat(self,link):
+        # Return [modification time as datetime, size in bytes], or None when
+        # the file is missing/inaccessible.
+        try:
+            stats = os.stat(link)
+            # stats[8] == st_mtime, stats[6] == st_size
+            return [datetime.datetime.fromtimestamp(stats[8]),stats[6]]
+        except OSError:
+            return None
+    def get(self,url):
+        # Read the whole file as bytes.
+        # NOTE(review): the file handle is never closed explicitly.
+        return open(url,'rb').read()
+    def getAbsolutePath(self,url):
+        return self.relativeUrl + url
+    def getFromRelative(self,url):
+        return self.get(self.getAbsolutePath(url))
+
+class LoaderHTTP(Loader):
+    # Loader fetching threads over HTTP (urllib2/httplib).
+    def __init__(self,link):
+        p = self.parseLink(link)
+        self.proto = p[0]
+        self.host = p[1]
+        self.baseUrl = p[2]
+        self.relativeUrl = p[3]
+    def stat(self,link):
+        # HEAD the resource; on 200 return [last-modified (or date) header
+        # parsed as datetime, content-length], otherwise None.
+        # NOTE(review): the size is the raw header value (a string, or 0 when
+        # the header is absent).
+        linkp = self.parseLink(link)
+        c = httplib.HTTPConnection(linkp[1])
+        c.request('HEAD', linkp[5])
+        r = c.getresponse()
+        if r.status == 200:
+            size = r.getheader('content-length',0)
+            date = r.getheader('last-modified',r.getheader('date',None))
+            return [DTP.getDateTime(date),size]
+        elif r.status == 404:
+            return None
+        else:
+            return None
+    def get(self,url):
+        # GET the URL with a Referer header; returns the body bytes, or None on
+        # an HTTP error status.
+        req = urllib2.Request(url)
+        req.add_header('Referer', self.baseUrl)
+        try:
+            f = urllib2.urlopen(req)
+            res = f.read()
+            return res
+        except urllib2.HTTPError:
+            return None
+    def getAbsolutePath(self,url):
+        # Host-absolute paths resolve against the site root, everything else
+        # against the thread page's directory.
+        if url[0] == '/':
+            return self.baseUrl + url
+        else:
+            return self.relativeUrl + url
+    def getFromRelative(self,url):
+        return self.get(self.getAbsolutePath(url))
+class IBFilter:
+    # Filter interface: filter(post) returns a truthy value to drop the post.
+    def filter(self,post):
+        return None
+class IBFilterSage(IBFilter):
+    # Drops posts marked as sage.
+    def filter(self,post):
+        return post.sage
+class IBFilterLowres(IBFilter):
+    # Drops posts whose attached picture is narrower than 50 pixels.
+    def filter(self,post):
+        return post.pic and post.pic.width < 50
+
+class Thread:
+    # One mirroring job for a remote thread, built from an ArchiveList entry.
+    def __init__(self,entry,parsers,directlink=None,forcetype=None):
+        self.parser = parsers[entry.type]
+        self.tid = entry.tid
+        self.url = entry.url
+        self.board = entry.board
+        self.chanTag= entry.chanTag
+        self.tags = entry.tags and entry.tags.split(',') or []
+        self.type = entry.type
+        self.forcetype = forcetype
+        self.lastChanged = entry.lastChanged
+        self.filters = []
+        # Resolve comma-separated filter names via the global registry;
+        # an unknown name raises KeyError here.
+        filters = entry.filters and entry.filters.split(',') or []
+        if filters:
+            for f in filters:
+                self.filters.append(GFilters[f])
+
+        self.timeDiff = entry.timeDiff
+        self.directlink = directlink
+        # Temporary base Loader used only for parseLink below; replaced by a
+        # concrete loader depending on whether the link carries a protocol.
+        self.loader = Loader()
+        if not self.directlink:
+            self.directlink = self.parser.GetThreadLink(self.url,self.board,self.tid)
+        if self.loader.parseLink(self.directlink)[0]:
+            self.loader = LoaderHTTP(self.directlink)
+        else:
+            self.loader = LoaderLocal(self.directlink)
+    def checkState(self):
+        # Compare the source's modification time with lastChanged and report an
+        # HTTP-like status: [404] gone, [200, mtime, size] changed,
+        # [304, mtime, size] unchanged.
+        stat = self.loader.stat(self.directlink)
+        if not stat:
+            return [404]
+        elif stat[0] > self.lastChanged:
+            return [200,stat[0],stat[1]]
+        else:
+            return [304,stat[0],stat[1]]
+    def initialize(self):
+        # Download and parse the thread page; fills self.posts and
+        # self.threadId (local DB id of the opening post, or None when the
+        # thread is not archived yet). True when at least one post was found.
+        page = self.loader.get(self.directlink)
+        if page:
+            parser = etree.HTMLParser()
+            if isinstance(page, str):
+                page = page.decode('utf-8')
+            self.document = etree.parse(StringIO.StringIO(page), parser)
+            self.posts = self.parser.GetPostsList(self)
+            self.threadId = self.parser.ResolveSecondaryId(self,[self.tid,self.tid])
+            if self.posts:
+                return True
+            else:
+                return False
+        else:
+            return False
+    def filter(self,post):
+        # Truthy when any attached filter wants the post dropped.
+        fl = None
+        if self.filters:
+            for f in self.filters:
+                fl = fl or f.filter(post)
+        return fl
+    def ReplaceReference(self,m):
+        # re.sub callback rewriting a ">>NNN" cross-reference anchor into a
+        # local archive link. Match groups: (href directory, thread number,
+        # post number); mg holds [thread number, post number].
+        mgg = m.groups()
+        mg = [mgg[1],mgg[2]]
+        tid = self.parser.ResolveSecondaryId(self,[mg[0],mg[0]])
+        if tid:
+            if mg[0] != mg[1]:
+                pid = self.parser.ResolveSecondaryId(self,[mg[0],mg[1]])
+            else:
+                pid = tid
+            if pid:
+                return '<a href="/%s#i%s" onclick="highlight(%s)">>>%s</a>' % (tid, pid, pid, mg[1])
+        # Target not archived (yet): emit a secondaryIndex-based fallback link.
+        print "ERROR! %s/%s does not exist!" % (mg[0],mg[1])
+        return '<a href="/secondaryIndex/%s#i%s" onclick="highlight(%s)">>>%s</a>' % (mg[0], mg[1], mg[1], mg[1])
+
+class WakabaParser(IBParser):
+    # Parser for wakaba-style board HTML.
+    # Extracts the numeric id from ">>NNN" reply links.
+    replyIdRe = re.compile(r""">>(\d+)""")
+    # Extracts thread id and optional "#iNNN" post anchor from page hrefs.
+    postIdRe = re.compile(r"""\/(\d+)\.x?h?t?ml?(#i?(\d+))?""")
+    # Matches a full anchor wrapping a ">>NNN" cross-reference in post bodies.
+    referenceRe = re.compile("""<a [^>]*href="([^"]*/)?(\d+)\.[^"]+"[^>]*>\>\;\>\;(\d+)</a>""")
+    def GetThreadLink(self,url,board,thread):
+        # Canonical wakaba thread URL: http://<chan>/<board>/res/<tid>.html
+        return 'http://'+url+'/'+board+'/res/'+str(thread)+'.html'
+    def GetPostsList(self,thread):
+        # Build one Post stub per reflink anchor found in the page, or None
+        # when the page contains no posts.
+        posts = thread.document.xpath("/html/body/form//*[@class='reflink']/a")
+        postsList = []
+        if posts:
+            for postA in posts:
+                post = Post()
+                post.thread = thread
+                post.href = postA.get('href')
+                post.reflink = postA.getparent()
+                post.Ids = self.GetPostID(post)
+                post.secondaryIndex = int(post.Ids[1])
+                postsList.append(post)
+            return postsList
+        else:
+            return None
+    def GetImgSrc(self,post):
+        # Find the attached image link (an <a> whose href contains '/src/')
+        # among the children of the post label's parent; None when absent.
+        cont = post.l.getparent()
+        for t in cont:
+            if t.tag.lower() == 'a':
+                href = t.get('href')
+                if href and href.find('/src/') != -1:
+                    if post.thread.forcetype:
+                        # Forced type: rewrite to a relative ../src/<file> path.
+                        return '../src/' + post.thread.loader.parseLink(href)[4]
+                    else:
+                        return href
+        return None
+
+    def ParseText(self,post):
+        # Serialize the post's <blockquote>, strip the wrapper tags, and
+        # rewrite cross-references via Thread.ReplaceReference.
+        # Returns None on unexpected markup, u'' when the post has no body.
+        if post.bq is not None:
+            post.bq.tail = ''
+            message = etree.tostring(post.bq, pretty_print=False,encoding='utf-8')
+            if message[:12].lower() == '<blockquote>' and message[-13:].lower() == '</blockquote>':
+                message = message[12:-13]
+            else:
+                print "Cant parse this message : '%s'" % message
+                return None
+            message = self.referenceRe.sub(post.thread.ReplaceReference,message)
+            return message
+        else:
+            return u''
+    def parsePost(self,post):
+        # Populate title, sage flag, image source, date and message text on a
+        # Post stub produced by GetPostsList.
+        post.bq = self.GetNextTag(post.reflink,'blockquote')
+        post.l = self.GetPreviousTag(post.reflink,'label')
+        post.title = unicodify(post.l[1].text)
+        if not post.title:
+            post.title = u''
+        post.cpn = post.l[2]
+        post.sage = False
+        # A link whose href mentions 'sage' marks the post as sage.
+        if len(post.cpn)>0 and post.cpn[0].tag.lower() == 'a':
+            post.cpnHref = post.cpn[0].get('href')
+            if post.cpnHref.find('sage') > -1:
+                post.sage = True
+        post.src = self.GetImgSrc(post)
+        # The date string is the tail text after the label's third child.
+        date = post.l[2].tail.encode('utf-8')
+        date = date.replace("\r",'').replace("\n",'')
+        post.date = DTP.getDateTime(date)
+        post.message = unicodify(self.ParseText(post))
+
+class UpdateArchive(Command):
+    # paster command that mirrors imageboard threads into the local archive DB.
+    # Parser configuration
+    summary = "--NO SUMMARY--"
+    usage = "--NO USAGE--"
+    group_name = "fc"
+    parser = Command.standard_parser(verbose=False)
+    parser.add_option("--mode")
+    parser.add_option("--chan")
+    parser.add_option("--board")
+    parser.add_option("--thread")
+    parser.add_option("--chanTag")
+    parser.add_option("--type")
+    parser.add_option("--tags")
+    parser.add_option("--timeDiff")
+    parser.add_option("--directlink")
+    parser.add_option("--list")
+    parser.add_option("--filters")
+    parser.add_option("--forcetype")
+    # Engine-name -> parser instance; Thread.__init__ looks its parser up here.
+    parsers = {'wakaba':WakabaParser()}
+    def command(self):
+        """Main command to create a new shell"""
+        self.verbose = 3
+        # --- Pylons bootstrap: load development.ini, build the WSGI app and a
+        # test-request context so model access (meta.Session, g) works here. ---
+        config_file = 'development.ini'
+        config_name = 'config:%s' % config_file
+        here_dir = os.getcwd()
+        locs = dict(__name__="pylons-admin")
+        conf = appconfig(config_name, relative_to=here_dir)
+        conf.update(dict(app_conf=conf.local_conf,global_conf=conf.global_conf))
+        paste.deploy.config.CONFIG.push_thread_config(conf)
+        sys.path.insert(0, here_dir)
+        wsgiapp = loadapp(config_name, relative_to=here_dir)
+        test_app = paste.fixture.TestApp(wsgiapp)
+        tresponse = test_app.get('/_test_vars')
+        request_id = int(tresponse.body)
+        test_app.pre_request_hook = lambda self:paste.registry.restorer.restoration_end()
+        test_app.post_request_hook = lambda self:paste.registry.restorer.restoration_begin(request_id)
+        paste.registry.restorer.restoration_begin(request_id)
+        # Locate the app's base module from the egg metadata, as `paster shell`
+        # does, and pull its public names into `locs`.
+        egg_info = find_egg_info_dir(here_dir)
+        f = open(os.path.join(egg_info, 'top_level.txt'))
+        packages = [l.strip() for l in f.readlines() if l.strip() and not l.strip().startswith('#')]
+        f.close()
+        found_base = False
+        for pkg_name in packages:
+            # Import all objects from the base module
+            base_module = pkg_name + '.lib.base'
+            found_base = can_import(base_module)
+            if not found_base:
+                # Minimal template
+                base_module = pkg_name + '.controllers'
+                found_base = can_import(base_module)
+
+            if found_base:
+                break
+
+        if not found_base:
+            raise ImportError("Could not import base module. Are you sure this is a Pylons app?")
+
+        base = sys.modules[base_module]
+        base_public = [__name for __name in dir(base) if not \
+            __name.startswith('_') or __name == '_']
+        for name in base_public:
+            locs[name] = getattr(base, name)
+        locs.update(dict(wsgiapp=wsgiapp, app=test_app))
+
+        mapper = tresponse.config.get('routes.map')
+        if mapper:
+            locs['mapper'] = mapper
+
+
+        # Copy the CLI options consumed by the helper methods below.
+        self.thread = self.options.thread
+        self.chan = self.options.chan
+        self.chanTag = self.options.chanTag
+        self.board = self.options.board
+
+        logging.getLogger('sqlalchemy').setLevel(logging.ERROR)
+        # Register the available post filters under their CLI names.
+        GFilters['sage'] = IBFilterSage()
+        GFilters['lowres'] = IBFilterLowres()
+        #logging.getLogger( 'sqlalchemy').setLevel( logging.NONE )
+        # Dispatch on --mode: 'update' (default) refreshes all archived
+        # threads, 'add' registers a thread, 'thread' archives one thread or
+        # every id listed in the --list file.
+        if not self.options.mode or self.options.mode == 'update':
+            self.UpdateArchive()
+        elif self.options.mode == 'add':
+            self.AddToArchive()
+        elif self.options.mode == 'thread':
+            if self.options.list:
+                f = open(self.options.list,'r')
+                tList = f.readlines()
+            else:
+                tList = [self.options.thread]
+            for t in tList:
+                # Build a transient ArchiveList entry (never saved to the DB
+                # in this mode).
+                entry = ArchiveList()
+                entry.tid = int(t)
+                entry.url = self.options.chan
+                entry.chanTag = self.options.chanTag
+                entry.board = self.options.board or 'b'
+                entry.tags = self.options.tags or ''
+                entry.type = self.options.type or 'wakaba'
+                entry.filters = self.options.filters or ''
+                entry.timeDiff = self.options.timeDiff or 0
+                entry.lastChanged = datetime.datetime.fromtimestamp(0)
+                print "Processing %s %s %s %s" % (entry.tid,entry.url,entry.chanTag,entry.board)
+                thread = Thread(entry,self.parsers,self.options.directlink,self.options.forcetype)
+                self.processThread(thread)
+
+    def LoadPage(self,thread,chan='2ch.ru',board='b'):
+        # Fetch a thread page (or the board index when `thread` is falsy) over
+        # HTTP with a Referer header.
+        # NOTE(review): appears unused by the command flow above — confirm.
+        self.host = 'http://'+chan
+        if thread:
+            self.path = '/'+board+'/res/'
+            self.url = self.host+self.path+thread+'.html'
+        else:
+            self.path = '/'+board+'/'
+            self.url = self.host+self.path
+        print self.url
+        req = urllib2.Request(self.url)
+        req.add_header('Referer', self.host+'/'+board+'/')
+        f = urllib2.urlopen(req)
+        res = f.read()
+        return res
+
+    def getTags(self,tagsList):
+        # Map tag names to Tag rows, creating new (not yet saved) Tag objects
+        # for names that do not exist yet.
+        tags = []
+        for tagName in tagsList:
+            tag = meta.Session.query(Tag).filter(Tag.tag==tagName).first()
+            if tag:
+                tags.append(tag)
+            else:
+                tags.append(Tag(tagName))
+        return tags
+
+    def processPost(self,post):
+        # Parse one post, download its image, apply the thread's filters, and
+        # save the post into the DB (recording it in the idList cache).
+        post.thread.parser.parsePost(post)
+        post.pic = False
+        if post.src:
+            post.pic = self.LoadImage(post)
+            # -1 marks a file that was fetched but could not be processed.
+            if post.pic == -1:
+                post.pic = None
+        if post.pic:
+            post.picid = post.pic.id
+        print "Thread %s Post %s (Image:%s %s %sx%s) at %s, sage : %s" % (post.Ids[0],post.Ids[1],post.src,post.pic and post.pic.id or 0,post.pic and post.pic.width or 0,post.pic and post.pic.height or 0,post.date,post.sage)
+        if (post.thread.filter(post)):
+            print "Filtered out"
+            print "----------------------"
+        else:
+            if post.Ids[0] == post.Ids[1]:
+                # Opening post: becomes the thread root (parentid == -1).
+                post.parentid = -1
+                post.replyCount = 0
+                post.bumpDate = post.date
+                post.tags = self.getTags([post.thread.chanTag,post.thread.board]+post.thread.tags)
+                post.thread.post = post
+            else:
+                # Reply: attach to the thread root; non-sage replies bump it.
+                post.parentid = post.thread.post.id
+                if not post.sage:
+                    post.thread.post.bumpDate = post.date
+                post.thread.post.replyCount += 1
+            post.uidNumber = 1
+            meta.Session.save(post)
+            meta.Session.commit()
+            # Remember the source-id -> local-id mapping for cross-references.
+            idList[post.Ids[1]]=[post.id,post.Ids[0]]
+            print "Saved in DB as %s/%s" % (post.id,post.parentid)
+            print "----------------------"
+
+    def processThread(self,thread):
+        # Archive every post of `thread` newer than what the DB already holds.
+        if thread.initialize():
+            if thread.threadId:
+                # Already archived: resume after the newest known reply.
+                thread.post = meta.Session.query(Post).get(thread.threadId)
+                lastPost = meta.Session.query(Post).filter(Post.parentid==thread.post.id).filter(Post.secondaryIndex>0).order_by(Post.secondaryIndex.desc()).first()
+                if lastPost:
+                    lastId = lastPost.secondaryIndex
+                else:
+                    lastId = int(thread.tid)
+            else:
+                lastId = 0
+            skipped = 0
+            for post in thread.posts:
+                if int(post.Ids[1]) > lastId:
+                    if skipped:
+                        print "Skipped %s out of %s posts" % (skipped,len(thread.posts))
+                        skipped=0
+                    self.processPost(post)
+                else:
+                    skipped += 1
+            if skipped:
+                print "Skipped %s out of %s posts" % (skipped,len(thread.posts))
+
+
+    def LoadImage(self,post):
+        # Download the post's image into the upload dir and register it via
+        # processFile. Returns a Picture, -1 (unprocessable), None, or False.
+        url = post.thread.loader.getAbsolutePath(post.src)
+        fileName = post.thread.loader.parseLink(url)[4]
+        res = post.thread.loader.getFromRelative(post.src)
+        if res:
+            localFilePath = os.path.join(g.OPT.uploadPath, fileName)
+            localFile = open(localFilePath,'wb')
+            localFile.write(res)
+            localFile.close()
+            file = FieldStorageLike(fileName,localFilePath)
+            fileDescriptors = self.processFile(file, 200)
+            pic = False
+            if fileDescriptors:
+                pic = fileDescriptors[0]
+                fileHolder = fileDescriptors[1]
+                if pic and pic != -1 and fileHolder:
+                    # Keep the stored file: the holder would delete it otherwise.
+                    fileHolder.disableDeletion()
+            return pic
+        else:
+            return None
+
+    def processFile(self, file, thumbSize=250):
+        # Store an uploaded file, deduplicate by MD5, create its thumbnail and
+        # a Picture row. Returns [Picture, AngryFileHolder-or-False],
+        # [-1, holder] on thumbnail failure, '' when the name has no extension,
+        # or False for bad input / unknown extension.
+        if isinstance(file, cgi.FieldStorage) or isinstance(file,FieldStorageLike):
+            # We should check whether we got this file already or not
+            # If we dont have it, we add it
+            name = str(long(time.time() * 10**7))
+            ext = file.filename.rsplit('.',1)[:0:-1]
+
+            if ext:
+                ext = ext[0].lstrip(os.sep)
+            else:
+                # Panic, no extention found
+                # NOTE(review): the assignment below is dead code — the
+                # function returns immediately afterwards.
+                ext = ''
+                return ''
+
+            # Make sure its something we want to have
+
+            extParams = meta.Session.query(Extension).filter(Extension.ext==ext).first()
+
+            if not extParams:
+                return False
+
+            localFilePath = os.path.join(g.OPT.uploadPath, name + '.' + ext)
+            localFile = open(localFilePath,'w+b')
+            shutil.copyfileobj(file.file, localFile)
+            localFile.seek(0)
+            md5 = hashlib.md5(localFile.read()).hexdigest()
+            file.file.close()
+            localFile.close()
+
+            # Duplicate content: drop the new copy and reuse the existing row.
+            pic = meta.Session.query(Picture).filter(Picture.md5==md5).first()
+
+            if pic:
+                os.unlink(localFilePath)
+                return [pic, False]
+
+            try:
+                if extParams.type == 'image':
+                    thumbFilePath = name + 's.' + ext
+                    size = self.makeThumbnail(localFilePath, os.path.join(g.OPT.uploadPath,thumbFilePath), (thumbSize,thumbSize))
+                else:
+                    if extParams.type == 'image-jpg':
+                        thumbFilePath = name + 's.jpg'
+                        size = self.makeThumbnail(localFilePath, os.path.join(g.OPT.uploadPath,thumbFilePath), (thumbSize,thumbSize))
+                    else:
+                        # Non-image type: use the extension's stock thumbnail.
+                        thumbFilePath = extParams.path
+                        size = [0, 0, extParams.thwidth, extParams.thheight]
+            except:
+                # NOTE(review): bare except — any thumbnailing error yields the
+                # [-1, holder] sentinel; the holder schedules the temp file for
+                # deletion.
+                return [-1, AngryFileHolder(localFilePath)]
+
+            pic = Picture()
+            pic.path = name + '.' + ext
+            pic.thumpath = thumbFilePath
+            pic.width = size[0]
+            pic.height = size[1]
+            pic.thwidth = size[2]
+            pic.thheight = size[3]
+            pic.extid = extParams.id
+            # os.stat(...)[6] == st_size
+            pic.size = os.stat(localFilePath)[6]
+            pic.md5 = md5
+            meta.Session.save(pic)
+            meta.Session.commit()
+            return [pic, AngryFileHolder(localFilePath, pic)]
+        else:
+            return False
+
+    def makeThumbnail(self, source, dest, maxSize):
+        # Write a thumbnail of `source` to `dest` and return
+        # (orig_w, orig_h, thumb_w, thumb_h).
+        # NOTE(review): Image.open raises on failure, so the else branch is
+        # likely unreachable — confirm.
+        sourceImage = Image.open(source)
+        size = sourceImage.size
+        if sourceImage:
+            sourceImage.thumbnail(maxSize,Image.ANTIALIAS)
+            sourceImage.save(dest)
+            return size + sourceImage.size
+        else:
+            return []
+    def AddToArchive(self):
+        # --mode add: register a thread in ArchiveList unless already present.
+        if self.options.thread and self.options.chan and self.options.chanTag:
+            if not self.options.board:
+                self.options.board = 'b'
+            entry = meta.Session.query(ArchiveList).filter(ArchiveList.tid==self.options.thread).filter(ArchiveList.url==self.options.chan).filter(ArchiveList.board==self.options.board).first()
+            if entry:
+                print "Thread is already in the list"
+            else:
+                entry = ArchiveList()
+                entry.tid = self.options.thread
+                entry.url = self.options.chan
+                entry.chanTag = self.options.chanTag
+                entry.board = self.options.board
+                entry.tags = self.options.tags or ''
+                entry.type = self.options.type or 'wakaba'
+                entry.filters = self.options.filters or ''
+                entry.timeDiff = self.options.timeDiff or 0
+                entry.lastChanged = datetime.datetime.fromtimestamp(0)
+                meta.Session.save(entry)
+                meta.Session.commit()
+        else:
+            print "Bad parameters"
+    def UpdateArchive(self):
+        # --mode update (default): refresh every registered thread; delete
+        # entries whose source reports 404, skip unchanged (304) ones.
+        archiveList = meta.Session.query(ArchiveList).all()
+        for entry in archiveList:
+            thread = Thread(entry,self.parsers)
+            state = thread.checkState()
+            print "*** Thread %s HTTP %s" % (thread.directlink,state[0])
+            if state[0] == 404:
+                meta.Session.delete(entry)
+                meta.Session.commit()
+            elif state[0] == 200:
+                self.processThread(thread)
+                entry.lastChanged = state[1]
+                meta.Session.commit()