# -*- coding: utf-8 -*-
"""
Library to get and put pages on a MediaWiki wiki.

Contents of the library (objects and functions to be used outside)

Classes:
    Page(site, title): A page on a MediaWiki site
    DataPage(site, title): A Page object for the data repository
    ImagePage(site, title): An image descriptor Page
    Site(lang, fam): A MediaWiki site

Factory functions:
    Family(name): Import the named family
    getSite(lang, fam): Return a Site instance

Exceptions:
    Error:             Base class for all exceptions in this module
    NoUsername:        Username is not in user-config.py
    NoPage:            Page does not exist on the wiki
    NoSuchSite:        Site does not exist
    IsRedirectPage:    Page is a redirect page
    IsNotRedirectPage: Page is not a redirect page
    LockedPage:        Page is locked
    SectionError:      The section specified in the Page title does not exist
    PageNotSaved:      Saving the page has failed
      EditConflict:    PageNotSaved due to an edit conflict while uploading
      SpamfilterError: PageNotSaved due to the MediaWiki spam filter
      LongPageError:   PageNotSaved due to the length limit
    ServerError:       Got unexpected response from wiki server
    BadTitle:          Server responded with BadTitle
    UserBlocked:       Client's username or IP has been blocked
    PageNotFound:      Page not found in list

Objects:
    get_throttle: Call to limit rate of read-access to wiki
    put_throttle: Call to limit rate of write-access to wiki

Other functions:
    getall(): Load a group of pages
    handleArgs(): Process all standard command line arguments (such as
        -family, -lang, -log and others)
    translate(xx, dict): dict is a dictionary giving text depending on
        language; xx is a language code. Returns the text in the most
        applicable language for the xx: wiki
    setAction(text): Use 'text' instead of "Wikipedia python library" in
        edit summaries
    setUserAgent(text): Sets the string passed to the HTTP server as the
        User-agent: header
"""
if Recaptcha.search(data):
raise CaptchaError(
'We have been prompted for a ReCaptcha, but pywikipedia '
'does not yet support ReCaptchas')
def postForm(self, address, predata, sysop=False, cookies=None):
"""Post http form data to the given address at this site.
address - the absolute path without hostname.
predata - a dict or any iterable that can be converted to a dict,
containing keys and values for the http form.
cookies - the cookies to send with the form. If None, send self.cookies
Return a (response, data) tuple, where response is the HTTP
response object and data is a Unicode string containing the
body of the response.
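
        Example (an illustrative sketch only; the path and the form field
        names below are placeholders, not guaranteed by this library):

            response, text = site.postForm(
                '/w/index.php?title=Sandbox&action=submit',
                {'wpTextbox1': 'Hello world', 'wpSave': 'Save page'})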
"""
if ('action' in predata) and pywikibot.simulate and \
(predata['action'] in pywikibot.config.actions_to_block) and \
(address not in [self.export_address()]):
output(u'\03{lightyellow}SIMULATION: %s action blocked.\03{default}'
% predata['action'])
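            # Fake a successful empty response object so the caller can
            # proceed without actually contacting the server.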
import StringIO
f_dummy = StringIO.StringIO()
f_dummy.__dict__.update({u'code': 0, u'msg': u''})
return f_dummy, u''
data = self.urlEncode(predata)
try:
if cookies:
return self.postData(address, data, sysop=sysop,
cookies=cookies)
else:
return self.postData(address, data, sysop=sysop,
cookies=self.cookies(sysop=sysop))
except socket.error, e:
raise ServerError(e)
def postData(self, address, data,
contentType='application/x-www-form-urlencoded',
sysop=False, compress=True, cookies=None):
"""Post encoded data to the given http address at this site.
address is the absolute path without hostname.
data is an ASCII string that has been URL-encoded.
Returns a (response, data) tuple where response is the HTTP
response object and data is a Unicode string containing the
body of the response.
"""
if address[-1] == "?":
address = address[:-1]
headers = {
'User-agent': str(useragent),
'Content-Length': str(len(data)),
'Content-type': str(contentType),
}
if cookies:
headers['Cookie'] = str(cookies)
if compress:
headers['Accept-encoding'] = 'gzip'
#print '%s' % headers
url = '%s://%s%s' % (self.protocol(), self.hostname(), address)
        # Try to retrieve the page until it is successfully loaded (just in
        # case the server is down or overloaded).
        # Wait retry_idle_time minutes (doubling each time, capped at half
        # an hour) between retries.
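        # e.g. wait 1, 2, 4, 8, 16, 30, 30, ... minutes between attempts.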
retry_idle_time = 1
retry_attempt = 0
while True:
try:
request = urllib2.Request(str(url), str(data), headers)
f = MyURLopener.open(request)
# read & info can raise socket.error
text = f.read()
headers = f.info()
break
except KeyboardInterrupt:
raise
            except urllib2.HTTPError, e:
                if e.code in [401, 404]:
                    debug(u"Got HTTP/%i %s:\n%r"
                          % (e.code, e.msg, e.read()))
                    raise PageNotFound(u'Page %s could not be retrieved. '
                                       u'Check your family file?' % url)
# just check for HTTP Status 500 (Internal Server Error)?
elif e.code in [500, 502, 504]:
output(u'HTTPError: %s %s' % (e.code, e.msg))
if config.retry_on_fail:
retry_attempt += 1
if retry_attempt > config.maxretries:
raise MaxTriesExceededError()
warning(u"Could not open '%s'.\nMaybe the server is "
"down. Retrying in %i minutes..."
% (url, retry_idle_time))
time.sleep(retry_idle_time * 60)
# Next time wait longer, but not longer than half an
# hour
retry_idle_time *= 2
if retry_idle_time > 30:
retry_idle_time = 30
continue
raise
else:
output(u"Result: %s %s" % (e.code, e.msg))
raise
except Exception:
exception(tb=pywikibot.verbose)
if config.retry_on_fail:
retry_attempt += 1
if retry_attempt > config.maxretries:
raise MaxTriesExceededError()
warning(u"Could not open '%s'. Maybe the server or\nyour "
u"connection is down. Retrying in %i minutes..."
% (url, retry_idle_time))
time.sleep(retry_idle_time * 60)
retry_idle_time *= 2
if retry_idle_time > 30:
retry_idle_time = 30
continue
raise
        # If the response carried any Set-Cookie headers, update our stored
        # cookies.
if hasattr(f, 'sheaders'):
ck = f.sheaders
else:
ck = f.info().getallmatchingheaders('set-cookie')
if ck:
Reat = re.compile(': (.*?)=(.*?); (expires=(.*?);)?')
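            # Match groups: 1 = cookie name, 2 = value, 4 = expiry date
            # (if present).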
tmpc = {}
for d in ck:
m = Reat.search(d)
if m:
exps = m.group(4)
if exps:
if (datetime.datetime.strptime(exps, '%a, %d-%b-%Y %H:%M:%S %Z')
- datetime.datetime.utcnow()) < datetime.timedelta(seconds=1):
continue
tmpc[m.group(1)] = m.group(2)
if self.cookies(sysop):
self.updateCookies(tmpc, sysop)
resContentType = headers.get('content-type', '')
contentEncoding = headers.get('content-encoding', '')
        # Ensure that all sent data was received.
        # In rare cases we found a double Content-Length in the header;
        # split it to get a single value.
content_length = int(headers.get('content-length', '0').split(',')[0])
if content_length != len(text) and 'content-length' in headers:
warning(u'len(text) does not match content-length: %s != %s'
% (len(text), content_length))
return self.postData(address, data, contentType, sysop, compress,
cookies)
if compress and contentEncoding == 'gzip':
text = decompress_gzip(text)
R = re.compile('charset=([^\'\";]+)')
m = R.search(resContentType)
if m:
charset = m.group(1)
else:
if verbose:
warning(u"No character set found.")
# UTF-8 as default
charset = 'utf-8'
# Check if this is the charset we expected
self.checkCharset(charset)
# Convert HTML to Unicode
try:
text = unicode(text, charset, errors='strict')
except UnicodeDecodeError:
if verbose:
exception()
error(u'Invalid characters found on %s://%s%s, replaced by \\ufffd.'
% (self.protocol(), self.hostname(), address))
# We use error='replace' in case of bad encoding.
text = unicode(text, charset, errors='replace')
# If a wiki page, get user data
self._getUserDataOld(text, sysop=sysop)
return f, text
## @deprecated("pywikibot.comms.http.request") # in 'compat' not yet...
def getUrl(self, path, retry=None, sysop=False, data=None, compress=True,
no_hostname=False, cookie_only=False, refer=None,
back_response=False):
"""
Low-level routine to get a URL from the wiki. Tries to login if it is
another wiki.
Parameters:
path - The absolute path, without the hostname.
retry - If True, retries loading the page when a network error
occurs.
sysop - If True, the sysop account's cookie will be used.
data - An optional dict providing extra post request
parameters.
cookie_only - Only return the cookie the server sent us back
Returns the HTML text of the page converted to unicode.
"""
from pywikibot.comms import http
f, text = http.request(self, path, retry, sysop, data, compress,
no_hostname, cookie_only, refer,
back_response=True)
# If a wiki page, get user data
self._getUserDataOld(text, sysop=sysop)
if back_response:
return f, text
return text
def _getUserData(self, text, sysop=False, force=True):
"""
Get the user data from an API query dict.
Parameters:
* text - the page text
* sysop - is the user a sysop?
"""
index = self._userIndex(sysop)
# Check for blocks
if 'blockedby' in text and not self._isBlocked[index]:
# Write a warning if not shown earlier
if sysop:
account = 'Your sysop account'
else:
account = 'Your account'
warning(u'\n%s on %s is blocked by %s.\nReason: %s\n'
u'Editing using this account will stop the run.\n'
% (account, self, text['blockedby'], text['blockreason']))
self._isBlocked[index] = 'blockedby' in text
        # Check for new messages; the data must have the key 'messages'.
if 'messages' in text:
if not self._messages[index]:
# User has *new* messages
if sysop:
output(u'NOTE: You have new messages in your sysop account '
u'on %s' % self)
else:
output(u'NOTE: You have new messages on %s' % self)
self._messages[index] = True
else:
self._messages[index] = False
# Don't perform other checks if the data was already loaded
if self._userData[index] and not force:
return
        # Get the username.
        # In anonymous mode the data has the key 'anon'; the username is
        # then an IP address, which we don't collect right now.
if 'anon' not in text:
self._isLoggedIn[index] = True
self._userName[index] = text['name']
else:
self._isLoggedIn[index] = False
self._userName[index] = None
# Get user groups and rights
if 'groups' in text:
self._rights[index] = []
for group in text['groups']:
# Convert dictionaries to list items (bug 3311663)
if isinstance(group, dict):
self._rights[index].extend(group.keys())
else:
self._rights[index].append(group)
self._rights[index].extend(text['rights'])
        # Warnings
        # Don't show warnings for users who are not logged in; they would
        # just fail to perform any action.
if self._isLoggedIn[index]:
if 'bot' not in self._rights[index] and \
config.notify_unflagged_bot:
# Sysop + bot flag = Sysop flag in MediaWiki < 1.7.1?
if sysop:
output(u'Note: Your sysop account on %s does not have '
u'a bot flag. Its edits will be visible in the '
u'recent changes.' % self)
else:
warning(u'Your account on %s does not have a bot flag. '
u'Its edits will be visible in the recent '
u'changes and it may get blocked.' % self)
if sysop and 'sysop' not in self._rights[index]:
warning(u'Your sysop account on %s does not seem to have '
u'sysop rights. You may not be able to perform any '
u'sysop-restricted actions using it.' % self)
else:
            # 'groups' is missing; assume a default set of rights
self._rights[index] = []
if self._isLoggedIn[index]:
# Logged in user
self._rights[index].append('user')
# Assume bot, and thus autoconfirmed
self._rights[index].extend(['bot', 'autoconfirmed'])
if sysop:
# Assume user reported as a sysop indeed has the sysop
# rights
self._rights[index].append('sysop')
        # Assume the user has the default rights if the API did not report
        # them
        self._rights[index].extend(['read', 'createaccount', 'edit',
                                    'upload', 'createpage', 'createtalk',
                                    'move'])
        # Remove duplicate rights
        self._rights[index] = list(set(self._rights[index]))
# Get token
if 'preferencestoken' in text:
self._token[index] = text['preferencestoken']
if self._rights[index] is not None:
# Token and rights are loaded - user data is now loaded
self._userData[index] = True
        elif self.versionnumber() < 14:
            # uiprop 'preferencestoken' only exists since MediaWiki 1.14;
            # on 1.8-1.13 we have to request an edit token instead
params = {
'action': 'query',
'prop': 'info',
'titles': 'Non-existing page',
'intoken': 'edit',
}
data = query.GetData(params, self,
sysop=sysop)['query']['pages'].values()[0]
if 'edittoken' in data:
self._token[index] = data['edittoken']
self._userData[index] = True
else:
warning(u'Token not found on %s. You will not be able to edit '
u'any page.' % self)
else:
if not self._isBlocked[index]:
warning(u'Token not found on %s. You will not be able to edit '
u'any page.' % self)
def _getUserDataOld(self, text, sysop=False, force=True):
"""
Get the user data from a wiki page data.
Parameters:
* text - the page text
* sysop - is the user a sysop?
"""
index = self._userIndex(sysop)
        if '<div id="globalWrapper">' not in text:
            # Not a wiki page
            return
        # Check for blocks - but only if the version is at least 1.11
        # (where userinfo is available) and the user data was not yet loaded
if self.versionnumber() >= 11 and (not self._userData[index] or force):
blocked = self._getBlock(sysop=sysop)
if blocked and not self._isBlocked[index]:
# Write a warning if not shown earlier
if sysop:
account = 'Your sysop account'
else:
account = 'Your account'
warning(u'%s on %s is blocked. Editing using this account will '
u'stop the run.' % (account, self))
self._isBlocked[index] = blocked
# Check for new messages
        if '<div class="usermessage">' in text:
if not self._messages[index]:
# User has *new* messages
if sysop:
output(u'NOTE: You have new messages in your sysop account '
u'on %s' % self)
else:
output(u'NOTE: You have new messages on %s' % self)
self._messages[index] = True
else:
self._messages[index] = False
# Don't perform other checks if the data was already loaded
if self._userData[index] and not force:
return
        # Search for the user page link at the top.
        # Note that the link of anonymous users (which doesn't exist at all
        # on Wikimedia sites) has the ID pt-anonuserpage, and thus won't be
        # found here.
        userpageR = re.compile(
            '<li id="pt-userpage"><a href=".+?">(?P<username>.+?)</a></li>')

            # Stop paging once the localized "next page" navigation link
            # (MediaWiki message 'nextpage'; its $1 placeholder is treated
            # as a wildcard) no longer appears in the returned HTML. The
            # pattern prefix here is an approximation.
            Rnonext = re.compile(
                'title="%s:.*?">%s</a>'
                % (self.mediawiki_message('nstab-special'),
                   re.escape(self.mediawiki_message(
                       'nextpage')).replace('\$1', '.*?')))
            if not Rnonext.search(full_returned_html):
                break
def prefixindex(self, prefix, namespace=0, includeredirects=True):
"""Yield all pages with a given prefix.
Parameters:
prefix The prefix of the pages.
namespace Namespace number; defaults to 0.
MediaWiki software will only return pages in one namespace
at a time.
If includeredirects is False, redirects will not be found.
If includeredirects equals the string 'only', only redirects
will be found. Note that this has not been tested on older
versions of the MediaWiki code.
It is advised not to use this directly, but to use the
PrefixingPageGenerator from pagegenerators.py instead.
"""
for page in self.allpages(start=prefix, namespace=namespace,
includeredirects=includeredirects):
if page.title(withNamespace=False).startswith(prefix):
yield page
else:
break
def protectedpages(self, namespace=None, type='edit', lvl=0):
""" Yield all the protected pages, using Special:ProtectedPages
* namespace is a namespace number
* type can be 'edit' or 'move
* lvl : protection level, can be 0, 'autoconfirmed', or 'sysop'
"""
        # Build the URL piece by piece to avoid encoding problems.
url = self.protectedpages_address()
url += '&type=%s&level=%s' % (type, lvl)
        # N.B. testing 'if namespace' would be simpler, but is False when
        # namespace is 0
if namespace is not None:
url += '&namespace=%s' % namespace
parser_text = self.getUrl(url)
while True:
m = re.findall(
r'