# User:Benc/Scripts/what links out.py
#
# From Wikipedia, the free encyclopedia
# what_links_out.py
# Author:  Benc (http://en.wikipedia.org/wiki/User:Benc)
# Created: 21 September 2004
# Updated: 21 September 2004
# Purpose: Compile a list of articles that are linked to by a page. Sorted by
#          wiki and by namespace. Warning: may have unexpected results for
#          malformed wikicode.
#
# Usage:   Paste the page's wikicode between the triple quotes of pageSource
#          below and run this script. Alternatively, if you are a programmer
#          using this script as a module, call the reportLinks() function.

# Paste the wikicode of the page to analyse between the triple quotes below.
# This is a module-level global: reportLinks() overwrites it with the source
# it is given, and parseLinks() reads and reassigns it while scanning.
pageSource="""

"""

# -----------------------------------------------------------------------------

# Namespace prefixes recognised by parseNamespace(). They are spelled the way
# capNamespace() produces them (first letter upper case, the rest lower case),
# which is why 'Mediawiki' is deliberately not written 'MediaWiki'.
NAMESPACES = [
	'Talk',
	'Category',
	'Category talk',
	'Help',
	'Help talk',
	'Image',
	'Image talk',
	'Mediawiki',
	'Mediawiki talk',
	'Template',
	'Template talk',
	'User',
	'User talk',
	'Wikipedia',
	'Wikipedia talk',
]

# Shortcut prefixes (looked up in upper case) and the namespace each stands for.
PSEUDO = {
	'WP': 'Wikipedia',
	'CAT': 'Category',
	'P': 'Wikipedia',	# Proposed WikiProject pseudo-namespace
}

# Interwiki prefixes (looked up in lower case) mapped to the label printed in
# the report heading. This list should probably be expanded.
INTERWIKI = {
	'b': '[[Wikibooks]]',
	'commons': 'Wikimedia Commons',
	'm': 'Meta-wiki',
	'q': '[[Wikiquote]]',
	'sep11': '9-11 Memorial',
	'wikt': '[[Wiktionary]]',
}

# Markers prepended to the namespace field of interwiki and translation links;
# reportLinks() recognises and strips them again when printing headings.
# NOTE(review): the 'ZZZ-' prefix presumably makes these groups sort after the
# real namespaces -- confirm with the author.
INTERWIKI_PLACEHOLDER = 'ZZZ-interwiki to: '
TRANSLATION_PLACEHOLDER = 'ZZZ-translation to: '

# Global accumulator for the tuples built by parseOneLink(); reportLinks()
# resets it before every run.
outgoingLinks = []

def uncapAll(txt):
	"""Return txt with every character converted to lower case."""
	lowered = txt.lower()
	return lowered
def capAll(txt):
	"""Return txt with every character converted to upper case."""
	raised = txt.upper()
	return raised
def capFirst(txt):
	"""Return txt with its first character upper-cased and the rest untouched.

	The first character is taken with a slice rather than an index so that
	an empty string is returned unchanged instead of raising IndexError
	(an empty target occurs for wikicode such as [[]] or [[|label]]).
	"""
	return txt[:1].upper() + txt[1:]
def capNamespace(txt):
	"""Return txt with its first character upper-cased and all others lower-cased.

	This is the spelling used for the entries of NAMESPACES.
	"""
	normalised = txt.capitalize()
	return normalised

def parsePipe(link):
	"""Split a link's text at its first pipe character.

	Returns a (target, displayed) pair. The target is everything before the
	first '|', passed through capFirst(); displayed is everything after that
	pipe, or None when the link contains no pipe at all.
	"""
	pieces = link.split('|', 1)	# at most one split: later pipes stay in the label
	target = capFirst(pieces[0])
	if len(pieces) == 1:
		return target, None
	return target, pieces[1]

def parseNamespace(link):
	"""Classify a link by the prefix in front of its first colon.

	Returns one of:
	  ''                                  main article namespace (no colon, or
	                                      the prefix is not recognised)
	  an entry of NAMESPACES              e.g. 'User talk'
	  the PSEUDO target                   e.g. 'Wikipedia' for 'WP:'
	  INTERWIKI_PLACEHOLDER + label       for a known interwiki prefix
	  TRANSLATION_PLACEHOLDER + prefix    for a two-letter lower-case prefix,
	                                      assumed to be a language code
	"""
	# A single leading colon, as in [[:Category:Foo]] or [[:fr:Foo]], only
	# asks the wiki to render an ordinary link; the real prefix follows it.
	target = link
	if target[:1] == ':':
		target = target[1:]

	i = target.find(':')
	if i < 0:
		return ''	# no colon at all: main article namespace

	ns = target[:i]
	if capNamespace(ns) in NAMESPACES:
		return capNamespace(ns)
	# 'in' rather than dict.has_key(), which no longer exists in Python 3.
	if capAll(ns) in PSEUDO:
		return PSEUDO[capAll(ns)]
	if uncapAll(ns) in INTERWIKI:
		return INTERWIKI_PLACEHOLDER + INTERWIKI[uncapAll(ns)]
	# Fairly safe assumption: two lower-case letters are a language code.
	# isalpha() keeps prefixes such as '12' (from "12:00") out of this branch.
	if len(ns) == 2 and ns.isalpha() and ns == uncapAll(ns):
		return TRANSLATION_PLACEHOLDER + ns
	# Otherwise the colon is not being used as a namespace indicator.
	return ''

def parseOneLink(link):
	"""Build the record kept for one link.

	Returns the 4-tuple (namespace, link, trueLink, displayedLink): the
	result of parseNamespace(), the raw link text as given, and the two
	values returned by parsePipe().
	"""
	return (parseNamespace(link), link) + parsePipe(link)

def parseLinks(depth=0):
	"""Scan the global pageSource for [[...]] links and record each one.

	Every complete, non-empty link is passed to parseOneLink() and the
	result is appended to the global outgoingLinks, ordered by the position
	of each link's opening brackets. Opening and closing brackets are
	matched as pairs, so nested wikicode such as
		[[Image:Example.gif|thumb|right|This is an [[example]] image]]
	yields the whole image link as well as the inner [[example]] link.

	Links that are never closed and empty links ([[]]) are skipped rather
	than recorded. The scan is iterative, so the number of nested links on
	a page does not affect the call stack.

	On return pageSource holds whatever follows the last brackets consumed.

	depth -- 0 for a normal scan. A positive value means the caller has
	         already consumed an opening '[[' and pageSource starts inside
	         that link.
	"""
	global pageSource, outgoingLinks

	text = pageSource
	# Stack of (sequence number, offset of link text) for links opened but
	# not yet closed. Sequence numbers record the order of the openings.
	openLinks = []
	if depth > 0:
		openLinks.append((0, 0))
	nextSeq = len(openLinks)

	found = []	# (sequence number, parsed link) for every completed link
	pos = 0
	while 1:
		closeAt = text.find(']]', pos)
		if closeAt < 0:
			break	# no closing brackets left, so no further link can be complete
		openAt = text.find('[[', pos)
		if 0 <= openAt < closeAt:
			# Another link opens before the next close: descend into it.
			openLinks.append((nextSeq, openAt + 2))
			nextSeq += 1
			pos = openAt + 2
			continue
		if openLinks:
			seq, start = openLinks.pop()
			linkText = text[start:closeAt]
			if linkText:
				found.append((seq, parseOneLink(linkText)))
		# A ']]' with no link open is stray text and is simply skipped.
		pos = closeAt + 2

	# Inner links complete before the links enclosing them; restore opening
	# order. Sequence numbers are unique, so only they are ever compared.
	found.sort()
	for seq, parsed in found:
		outgoingLinks.append(parsed)
	pageSource = text[pos:]

def reportLinks(src):
	"""Main interface function: report the links found in a page's wikicode.

	src -- the wikicode source of the page to analyse.

	Returns a string of wikicode: a heading line followed by one '*' entry
	per namespace, interwiki target or translation language, each followed
	by '**' entries for its links (with the displayed text in parentheses
	when the link was piped). Translation groups get a heading only; their
	individual links are not listed.

	Side effects: overwrites the globals pageSource and outgoingLinks.
	"""
	global pageSource, outgoingLinks
	pageSource = src
	outgoingLinks = []

	parseLinks()
	# Tuples sort by namespace first, which groups the report by namespace.
	outgoingLinks.sort()

	pieces = ['The following pages are linked to by this page:\n']
	curNamespace = '?'	# sentinel that matches no real namespace value
	translation = False
	for link in outgoingLinks:
		namespace, target, displayed = link[0], link[2], link[3]
		if namespace != curNamespace:	# transitioning into new namespace
			curNamespace = namespace
			translation = False
			if curNamespace == '':
				pieces.append('*Main article namespace:\n')
			elif curNamespace.find(INTERWIKI_PLACEHOLDER) >= 0:
				pieces.append('*Interwiki to %s:\n' % (curNamespace[len(INTERWIKI_PLACEHOLDER):]))
			elif curNamespace.find(TRANSLATION_PLACEHOLDER) >= 0:
				pieces.append('*Translation: %s\n' % (curNamespace[len(TRANSLATION_PLACEHOLDER):]))
				translation = True
			else:
				pieces.append('*%s namespace:\n' % (curNamespace))

		if not translation:
			if displayed is None:
				pieces.append('**[[%s]]\n' % (target))
			else:
				pieces.append('**[[%s]] (%s)\n' % (target, displayed))
	return ''.join(pieces)

if __name__=="__main__":
	# Script entry point: report on the wikicode pasted into pageSource.
	# The parentheses make this line valid both as a Python 2 print
	# statement and as a Python 3 print() call, with identical output.
	print(reportLinks(pageSource))



# <- back to main page | talk