"""
    This file is part of PHP-AST Project by Romain Gaucher (http://rgaucher.info).

    PHP-AST is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Grabber is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PHP-AST.  If not, see <http://www.gnu.org/licenses/>.

	                           --------------

	preproc.py is a PHP preprocessor for analysis purposes. Since PHP is a crappy language,
	it can be really hard to handle correctly. I am currently working around some limitations of
	my AST converter with this tool for real project analysis (WordPress, Wikipedia, etc.).

	The tool:
	- Simplifies strings (keeps only the PHP variables)
	- Resolves the includes
	- Removes comments
"""
import os, re, sys, string

# PHP keywords that pull another file into the current one.
include = ["include","require"]
inc_once= ["include_once","require_once"]
# Content of a double-quoted / single-quoted string literal (greedy).
string_1 = re.compile("\"(.*)\"",re.I)
string_2 = re.compile("'(.*)'",re.I)
# A quoted include target made only of letters, digits, '_' and '.'.
simpleinclude1 = re.compile(r"\"[a-zA-Z0-9_\.]+\"",re.I)
simpleinclude2 = re.compile(r"'[a-zA-Z0-9_\.]+'",re.I)
# define(KEY, VALUE): group 1 is the key, group 2 is the value.
define = re.compile(r'define\((.*),(.*)\)',re.I)
# Constants collected from define() calls, keyed by constant name.
definelist = {}
# Markers that open and close a PHP section.
php_start = ["<?", "<?php"]
php_end   = ["?>"]
# A PHP variable; group 1 is its name. The '$' has to be escaped: a bare '$'
# is the end-of-string anchor, which made this pattern impossible to match.
var = re.compile(r'\$([a-zA-Z0-9_]+)', re.I)
# One character that may appear in an identifier: letter, digit or underscore.
letters = re.compile("[a-zA-Z0-9_]")
def is_alpha(c):
	"""Tell whether *c* begins with a letter, a digit or an underscore.

	Only the start of *c* is examined; an empty string yields False.
	"""
	return bool(letters.match(c))

# include/require (optionally _once) followed by an opening quote or parenthesis.
reginc = re.compile(r'(.*)([\W\D;\s]*)(include|require)(_once)?([\s]*)(["\'\(]+)(.*)$', re.I)
def include_in_line(string):
	"""Detect an include/require statement in one line of PHP.

	Returns False when the line contains neither the lower-case word
	'include' nor 'require'. Otherwise returns the result of matching
	``reginc`` against the line: a match object, or None if the keyword is
	not followed by a quote or an opening parenthesis.
	"""
	has_keyword = 'include' in string or 'require' in string
	if not has_keyword:
		return False
	return reginc.match(string)

#print include_in_line("include(\"../includes/geshi/geshi.php\");")
#sys.exit(0)

def retFirstOccurence(string, array):
	"""Return the lowest index at which any item of *array* occurs in *string*.

	Items that do not occur in *string* are ignored, so a single missing
	item no longer hides the position of the ones that were found.

	Returns -1 when no item occurs at all, including when *array* is empty.
	"""
	hits = [p for p in (string.find(a) for a in array) if p >= 0]
	if not hits:
		return -1
	return min(hits)

def replace_all(s, old, new):
	"""Replace *old* by *new* in *s* until *old* no longer occurs.

	Unlike a single ``str.replace`` call, the replacement is repeated, so
	occurrences created by an earlier pass are replaced as well.

	Returns *s* unchanged when *old* equals *new*.
	Raises ValueError when *old* is contained in *new*, because the loop
	could then never finish.
	"""
	if old != new:
		if old in new:
			raise ValueError("old substring can't be part of the replacement substring.")
		while old in s:
			s = s.replace(old, new)
	return s

def include_file(filename):
	"""Preprocess *filename* so that its content can be inlined.

	Prints a trace line for every attempted include.

	Returns the result of ``process(filename)`` when the file exists. When it
	does not, a message is printed and an empty string is returned, so that
	callers iterating over the result get nothing instead of failing on None.
	"""
	# print(...) with a single argument behaves the same on Python 2 and 3.
	print("include <-- %s" % filename)
	if os.path.isfile(filename):
		return process(filename)
	print(" --> Cannot include the file. The file doesn't exist / File not found")
	return ""

def remove_concate(string):
	"""Drop the '.' characters that stand outside quoted strings.

	The text is scanned once. Outside a string every '.' is removed and a
	single or double quote opens a string. Inside a string everything is
	copied verbatim; the string ends at the same quote character unless the
	character just before it is a backslash.
	"""
	kept = []
	delim = None  # quote character of the string being read, None outside
	for idx, ch in enumerate(string):
		if delim is None:
			if ch != ".":
				kept.append(ch)
				if ch in ('"', "'"):
					delim = ch
		else:
			if ch == delim and string[idx-1] != '\\':
				delim = None
			kept.append(ch)
	return "".join(kept)


def clean_string(l):
	"""Return one line of PHP with comments and string contents removed.

	The line is scanned with a small state machine that starts in "source":
	- '//' and '#' end the scan; the rest of the line is discarded.
	- '/*' ... '*/' is skipped.
	- Inside a quoted string only the quotes and the '$name' variables are
	  kept. The character that follows a variable is replaced by a space,
	  unless it is the closing quote.
	- Between '[' and ']' quotes do not open a string, so the text is copied
	  as is.
	- For '?>' the '?' is dropped; the following text is handled like source.

	Look-ahead uses slices and explicit bounds, so a line that ends right
	after '/', '?', '*' or a '$name' no longer raises IndexError.
	"""
	i = 0
	state = "source"
	in_buff = ""
	com_delim = None
	max_len = len(l)
	while i < max_len:
		# Next character, or '' on the last one (a slice never raises).
		nxt = l[i+1:i+2]
		if state == "comment":
			if l[i] == '*' and nxt == '/':
				state = "source"
				i += 1
		elif state == "string":
			if l[i] == com_delim:
				# NOTE(review): every branch below closes the string, so a
				# backslash never protects the delimiter; the look-behind only
				# decides whether the delimiter is emitted once or twice.
				if l[i-1] == '\\':
					if l[i-2] != '\\':
						in_buff += l[i]
						state = "source"
						com_delim = None
				else:
					in_buff += l[i]
					state = "source"
					com_delim = None

				if l[i-2] == '\\':
					in_buff += l[i]
					state = "source"
					com_delim = None
			elif l[i] == '$':
				in_buff += l[i]
				i+=1
				# add the variables (bound tested before indexing)
				while i < max_len and is_alpha(l[i]):
					in_buff += l[i]
					i+=1
				if i < max_len:
					if l[i] == com_delim:
						in_buff += com_delim
						state = "source"
					else:
						in_buff += ' '
		else:
			# simply in source ("source", "array" and "html" all end up here)
			if l[i] == '?' and nxt == '>':
				state = "html"
			elif l[i] == '/' and nxt == '/':
				break
			elif l[i] == '/' and nxt == '*':
				state = "comment"
			elif l[i] == '#':
				break
			elif state != "array" and (l[i] == '"' or l[i] == "'"):
				state = "string"
				com_delim = l[i]
				in_buff += l[i]
			elif l[i] =='[' and state != "array":
				state = "array"
				in_buff += l[i]
			elif l[i] == ']' and state == "array":
				state = "source"
				in_buff += l[i]
			else:
				in_buff += l[i]
		i+=1
	return in_buff


def workout_include(string, rootdir):
	"""Expand the include/require statement found in the line *string*.

	The statement argument is the text between the keyword and the first
	';'. It is resolved against *rootdir*:
	- a plain quoted file name is used directly;
	- anything else has its concatenation dots removed, the known
	  ``definelist`` constants substituted and its quotes stripped.
	If the resulting path is an existing file, its preprocessed content
	(``include_file``) is inserted.

	Returns a list of lines, each terminated by a newline: the text before
	the statement, the included content if any, the placeholder
	``include ('');`` and the text that followed the statement.
	"""
	keyword = "require" if 'require' in string else "include"
	if '_once' in string:
		keyword += "_once"

	start = string.find(keyword)
	statement = string[start:]
	raw_target = statement[len(keyword):statement.find(';')]

	pieces = [string[:start]]
	target = ''.join(raw_target.split())
	if target:
		if target[0] == '(':
			# drop the opening parenthesis and the last character
			target = target[1:-1]
		simple = simpleinclude1.match(target) or simpleinclude2.match(target)
		if simple:
			# the whole match is the quoted name: strip both quotes
			path = rootdir + simple.group(0)[1:-1]
		else:
			# complex case: concatenations and define()d constants
			target = remove_concate(target)
			for name in definelist:
				if name in target:
					try:
						target = replace_all(target, name, definelist[name])
					except ValueError:
						continue
			target = replace_all(target, "'", '')
			target = replace_all(target, '"', '')
			path = ''.join(os.path.normpath(rootdir + target).split())
		if os.path.isfile(path):
			pieces.extend(include_file(path))

	pieces.append("include ('');")
	consumed = keyword + raw_target
	# the +1 steps over the ';' that ended the statement
	pieces.append(string[string.find(consumed) + len(consumed) + 1:])

	return [line + '\n' for line in ''.join(pieces).split('\n')]


def clean_spec_chars(l):
	"""Remove every backslash together with the character that follows it.

	A backslash that is the last character is simply dropped.
	"""
	kept = []
	skip_next = False
	for ch in l:
		if skip_next:
			# character escaped by the previous backslash
			skip_next = False
		elif ch == '\\':
			skip_next = True
		else:
			kept.append(ch)
	return "".join(kept)


def process(fname, checkIncludes = True):
	"""Preprocess the PHP file *fname* and return the simplified source.

	First pass, line by line: when *checkIncludes* is true, include/require
	statements are expanded with ``workout_include``; otherwise, and for all
	other lines, ``define(KEY, VALUE)`` calls are recorded in the module-level
	``definelist`` and the line is kept as is.

	Second pass: a state machine that persists across lines keeps only the
	text inside PHP tags, drops comments and reduces string literals to their
	quotes and '$name' variables. Backslash escapes are removed first from
	every line that does not contain '/*'.

	Finally the text is lower-cased by ``srcToLower`` and the 'or die(...)'
	clauses are removed.

	Returns the resulting text, or 1 when the file cannot be opened.
	"""
	global definelist
	out_buff = []
	in_buff = ""
	try:
		f = open(fname, 'r')
	except IOError:
		# Report the file name: the handle ``f`` is not bound when open() fails.
		print("Cannot open the file %s" % fname)
		return 1
	state = "html"
	com_delim = None
	# Directory part of fname, trailing separator included.
	root = fname[:max(fname.rfind('/')+1, fname.rfind('\\')+1)]
	try:
		for l in f:
			cl = clean_string(l)
			if checkIncludes and include_in_line(cl):
				out_buff.extend(workout_include(l,root))
			else:
				if 'define' in l:
					# Locate 'define' in the cleaned line itself: an offset
					# taken from the raw line does not fit once comments or
					# string contents have been removed.
					pos = cl.find('define')
					if pos >= 0:
						s = cl[pos:]
						s = s[:s.find(';')]
						out = define.match(s)
						if out:
							key = out.group(1)
							val = out.group(2)
							val = replace_all(val, "'",'')
							val = replace_all(val, '"','')
							key = replace_all(key, "'",'')
							key = replace_all(key, '"','')
							definelist[key] = val
				out_buff.append(l)
	finally:
		f.close()

	# finite state machine
	for o in out_buff:
		l = o
		if "/*" not in o:
			l = clean_spec_chars(o)
		i,max_len = 0,len(l)
		while i < max_len:
			before = i
			# Next character, or '' on the last one (a slice never raises).
			nxt = l[i+1:i+2]
			try:
				if state == "html":
					if l[i] == '<' and nxt == '?':
						if l[i+2:i+3] in ('p','P') and l[i+3:i+4] in ('h','H') and l[i+4:i+5] in ('p','P'):
							i+=4
						else:
							i+=1
						state = "source"
				elif state == "comment":
					if l[i] == '*' and nxt == '/':
						state = "source"
						i += 1
				elif state == "string":
					if l[i] == com_delim:
						if l[i-1] == '\\':
							if l[i-2] != '\\':
								in_buff += l[i]
								state = "source"
								com_delim = None
						else:
							in_buff += l[i]
							state = "source"
							com_delim = None

						if l[i-2] == '\\':
							in_buff += l[i]
							state = "source"
							com_delim = None
					elif l[i] == '$':
						in_buff += l[i]
						i+=1
						# add the variables (bound tested before indexing)
						while i < max_len and is_alpha(l[i]):
							in_buff += l[i]
							i+=1
						if i < max_len:
							if l[i] == com_delim:
								in_buff += com_delim
								state = "source"
							else:
								in_buff += ' '
				else:
					# simply in source ("source" and "array" both end up here)
					if l[i] == '?' and nxt == '>':
						state = "html"
					elif l[i] == '/' and nxt == '/':
						break
					elif l[i] == '/' and nxt == '*':
						state = "comment"
					elif l[i] == '#':
						break
					elif state != "array" and (l[i] == '"' or l[i] == "'"):
						state = "string"
						com_delim = l[i]
						in_buff += l[i]
					elif l[i] =='[' and state != "array":
						state = "array"
						in_buff += l[i]
					elif l[i] == ']' and state == "array":
						state = "source"
						in_buff += l[i]
					else:
						in_buff += l[i]
				i += 1
			except IndexError:
				# A look-behind ran off a very short line. Always make
				# progress, otherwise the same index is retried forever.
				if i == before:
					i += 1
	# remove the 'or die'
	in_buff = srcToLower(in_buff)
	in_buff = replace_die(in_buff)
	in_buff = in_buff.replace("or die(\"\" . mysql_error(\"\"))", "")
	in_buff = in_buff.replace("or die (mysql_error())", "")
	return in_buff
	

# A run of identifier characters, possibly containing '$'.
regword = re.compile('([$a-zA-Z0-9_]+)',re.I)
def myRegLower(x):
	"""Substitution callback: lower-case the matched word unless it contains '$'."""
	word = x.group()
	return word if '$' in word else word.lower()
def srcToLower(buff):
	"""Lower-case every word of *buff*, leaving words that contain '$' untouched."""
	return regword.sub(myRegLower, buff)

# "or die(<simple argument>)": the argument may hold word characters, '$',
# '.', quotes and whitespace, but no nested parentheses.
regdir = re.compile(r'or([\s]+)die([\s]*)\(([$_\w\d.\'"\s]+)\)',re.I)
def replace_die(buff):
	"""Remove every 'or die(...)' clause matched by ``regdir`` from *buff*."""
	return regdir.sub('', buff)

def pp_file(fname):
	"""Preprocess *fname* and write the result next to it.

	The output name is *fname* with its last '.php' (and whatever follows it)
	replaced by '.preproc.php'. When *fname* contains no '.php', the suffix is
	appended to the full name.

	The processed text is wrapped in '<?php' ... '?>'.

	Returns the name of the file that was written.
	"""
	s = process(fname)
	cut = fname.rfind('.php')
	# rfind() gives -1 when '.php' is absent; slicing with it would chop the
	# last character of the name.
	base = fname[:cut] if cut >= 0 else fname
	nname = base + '.preproc.php'
	out = open(nname,"w")
	try:
		out.write("<?php\n\n")
		out.write(s)
		out.write("\n\n?>")
	finally:
		# close the output even if a write fails
		out.close()
	return nname

if __name__ == "__main__":
	# Command-line entry point: preprocess the PHP file given as first argument.
	if len(sys.argv) < 2:
		# sys.exit() with a string prints it to stderr and exits with status 1,
		# instead of the bare IndexError traceback of a missing argument.
		sys.exit("usage: %s <file.php>" % sys.argv[0])
	fname = sys.argv[1]
	pp_file(fname)




