"""
This file is part of PHP-AST Project by Romain Gaucher (http://rgaucher.info).
PHP-AST is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
PHP-AST is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with PHP-AST. If not, see <http://www.gnu.org/licenses/>.
--------------
preproc.py is a PHP preprocessor for analysis purposes. Since PHP is a crappy language,
it may be really hard to handle it correctly. I am currently working around some limitations
of my AST converter with this tool so that real projects (WordPress, Wikipedia, etc.) can be analysed.
The tool:
- simplifies strings (keeps only the PHP variables)
- resolves the includes
- removes comments
"""
import os, re, sys, string
# Keywords that pull another file into the current one.
include = ["include","require"]
inc_once= ["include_once","require_once"]

# Double- and single-quoted string literals (greedy, whole line).
string_1 = re.compile("\"(.*)\"",re.I)
string_2 = re.compile("'(.*)'",re.I)

# Include argument that is just a quoted name made of letters, digits, '_' and '.'.
simpleinclude1 = re.compile(r"\"[a-zA-Z0-9_\.]+\"",re.I)
simpleinclude2 = re.compile(r"'[a-zA-Z0-9_\.]+'",re.I)

# define(KEY, VALUE) call; group 1 is the key, group 2 the value.
define = re.compile(r'define\((.*),(.*)\)',re.I)

# Constants collected from define() calls (key -> value), filled in by process().
definelist = {}

# NOTE(review): both entries are empty strings here; presumably these were
# meant to be the PHP opening tags -- confirm against the original file.
php_start = ["", ""]

# PHP variable such as "$name"; group 1 is the name without the dollar sign.
# The '$' has to be escaped: unescaped it is the end-of-string anchor and the
# pattern could never match anything.
var = re.compile(r'\$([a-zA-Z0-9_]+)', re.I)
# One character that may appear in a PHP identifier: letter, digit or '_'.
letters = re.compile("[a-zA-Z0-9_]")


def is_alpha(c):
    """Tell whether `c` starts with a letter, a digit or an underscore.

    Despite the name, digits and '_' are accepted as well. An empty string
    yields False.
    """
    matched = letters.match(c)
    if matched is None:
        return False
    return True
# An include/require keyword (optionally "_once") followed by optional
# whitespace and at least one quote or opening parenthesis.
reginc = re.compile(r'(.*)([\W\D;\s]*)(include|require)(_once)?([\s]*)(["\'\(]+)(.*)$', re.I)


def include_in_line(string):
    """Look for an include/require statement in `string`.

    Returns False when neither lower-case keyword occurs in the text;
    otherwise returns the result of matching `reginc` against it, i.e. a
    match object, or None when the keyword is not followed by a quote or
    parenthesis.
    """
    has_keyword = 'include' in string or 'require' in string
    if not has_keyword:
        return False
    return reginc.match(string)
#print include_in_line("include(\"../includes/geshi/geshi.php\");")
#sys.exit(0)
def retFirstOccurence(string, array):
    """Return the lowest index at which any item of `array` occurs in `string`.

    Items that do not occur at all are ignored, so one missing item no
    longer forces the result to -1. Returns -1 when none of the items occur
    or when `array` is empty.
    """
    # str.find() reports "not found" as -1; keep real positions only.
    found = [idx for idx in (string.find(a) for a in array) if idx >= 0]
    if not found:
        return -1
    return min(found)
def replace_all(s, old, new):
    """Replace `old` by `new` in `s` until no occurrence of `old` is left.

    The replacement is repeated because one pass can create new occurrences
    (e.g. collapsing "aa" to "a"). When `old` equals `new` the input is
    returned untouched.

    Raises ValueError when `old` is contained in `new`, since the loop
    would never terminate.
    """
    if old != new:
        if old in new:
            raise ValueError("old substring can't be part of the replacement substring.")
        while s.find(old) != -1:
            s = s.replace(old, new)
    return s
def include_file(filename):
    """Preprocess the file `filename` and return what process() yields for it.

    A trace line is printed for every request. When `filename` is not an
    existing regular file a second message is printed and None is returned.
    """
    print("include <-- %s" % filename)
    if not os.path.isfile(filename):
        print(" --> Cannot include the file. The file doesn't exist / File not found")
        return None
    return process(filename)
def remove_concate(string):
    """Drop the '.' concatenation operators that sit outside quoted strings.

    Characters inside single- or double-quoted strings (dots included) are
    kept as they are. A quote preceded by a backslash does not close the
    string.
    """
    kept = []
    quote = None      # delimiter of the string being scanned, None outside strings
    previous = ""     # character seen just before the current one
    for ch in string:
        if quote is None:
            if ch != ".":
                kept.append(ch)
            if ch in ('"', "'"):
                quote = ch
        else:
            if ch == quote and previous != '\\':
                quote = None
            kept.append(ch)
        previous = ch
    return "".join(kept)
# clean_string(l): simplify one line of PHP source and return the result.
#
# The line `l` is scanned character by character with a small state machine
# (states used here: "source", "string", "comment", "array", "html"); the
# characters that are kept are accumulated in `in_buff`, which is returned.
#   - comment: nothing is kept; "*/" switches back to "source".
#   - string:  ordinary characters are not kept; only the delimiters and
#              `$variable` names are appended.
#   - anything else is handled by the final "simply in source" branch.
#
# NOTE(review): the indentation of this function is not visible here, so the
# exact nesting of the branches below (in particular the escaped-quote
# handling right after `if l[i] == com_delim:`) must be confirmed against
# the original file before relying on these comments.
# NOTE(review): several tests read l[i+1] (and l[i-2]) without a bounds
# check and there is no try/except in this function, so a line ending in
# '/', '*' or '?' can raise IndexError -- confirm whether callers always
# pass newline-terminated lines.
def clean_string(l):
i = 0
state = "source"
in_buff = ""
# delimiter (' or ") of the string currently being scanned
com_delim = None
max_len = len(l)
while i < max_len:
if state == "comment":
# inside /* ... */: look for the closing "*/"
if l[i] == '*':
if l[i+1] =='/' :
state = "source"
# extra step, presumably to skip the '/' of "*/"
i += 1
elif state == "string":
if l[i] == com_delim:
# candidate closing quote; the tests on l[i-1] / l[i-2] look at
# preceding backslashes (escaped quote vs. escaped backslash)
if l[i-1] == '\\':
if l[i-2] != '\\':
in_buff += l[i]
state = "source"
com_delim = None
else:
in_buff += l[i]
state = "source"
com_delim = None
if l[i-2] == '\\':
in_buff += l[i]
state = "source"
com_delim = None
elif l[i] == '$':
# a PHP variable inside the string: keep the '$' and its name
in_buff += l[i]
i+=1
# add the variables
# NOTE(review): l[i] is evaluated before `i < max_len`, so a variable
# that ends the line raises IndexError here.
while is_alpha(l[i]) and i < max_len:
in_buff += l[i]
i+=1
# the character right after the name either closes the string or is
# replaced by a single space
if l[i] == com_delim:
in_buff += com_delim
state = "source"
else:
in_buff += ' '
else:
# simply in source
if l[i] == '?' and l[i+1] == '>':
#in_buff += ";"
# PHP closing tag: nothing is appended, only the state changes
state = "html"
else:
# "//" and "#" start a line comment: stop scanning the line
if l[i] == '/' and l[i+1] == '/':
break
elif l[i] == '/' and l[i+1] == '*':
state = "comment"
elif l[i] == '#':
break
# a quote opens a string, except while inside [...]
elif state != "array" and (l[i] == '"' or l[i] == "'"):
state = "string"
com_delim = l[i]
in_buff += l[i]
elif l[i] =='[' and state != "array":
state = "array"
in_buff += l[i]
elif l[i] == ']' and state == "array":
state = "source"
in_buff += l[i]
else:
in_buff += l[i]
i+=1
return in_buff
# workout_include(string, rootdir): expand the include/require statement found
# in the raw source line `string`.
#
#   string  -- one line of PHP source containing include/require[_once]
#   rootdir -- prefix prepended to the included file name (the caller in this
#              file passes the directory part of the including file)
#
# Returns a list of lines (each terminated by '\n') built from: the text that
# precedes the statement, the preprocessed content of the included file when
# it can be located, a placeholder "include ('');" and the text that follows
# the statement.
#
# NOTE(review): the indentation of this function is not visible here; in
# particular confirm whether the second `studyStr = ...` after the '(' test
# and the `buff.append("include ('');")` line are conditional or not.
def workout_include(string, rootdir):
buff = []
once = False
require = False
# the keyword is guessed from plain substring tests on the whole line
if '_once' in string:
once = True
if 'require' in string:
require = True
lookfor = ""
if require:
lookfor = "require"
else:
lookfor = "include"
if once:
lookfor += "_once"
pos = string.find(lookfor)
inStr = string[pos:]
# argument of the statement: everything between the keyword and the first ';'
# NOTE(review): when there is no ';' find() returns -1 and the slice drops
# the last character instead.
studyStr = inStr[len(lookfor):inStr.find(';')]
# add after and before
savedStudyStr = studyStr
# text located before the include keyword
buff.append(string[:string.find(lookfor)])
# remove all whitespace from the argument
studyStr = ''.join(studyStr.split())
if len(studyStr) > 0:
# strip the surrounding parentheses
if studyStr[0] == '(':
studyStr = studyStr[1:]
studyStr = studyStr[:len(studyStr)-1]
# simple case: the argument starts with a double-quoted plain file name
if simpleinclude1.match(studyStr):
out = simpleinclude1.search(studyStr)
toInc = out.group(0)
toInc = toInc[toInc.find('"')+1:toInc.rfind('"')]
if os.path.isfile(rootdir + toInc):
for l in include_file(rootdir + toInc):
buff.append(l)
# same with a single-quoted file name
elif simpleinclude2.match(studyStr):
out = simpleinclude2.search(studyStr)
toInc = out.group(0)
toInc = toInc[toInc.find("'")+1:toInc.rfind("'")]
if os.path.isfile(rootdir + toInc):
for l in include_file(rootdir + toInc):
buff.append(l)
else:
# complex case
#print "\n\n #########################"
#print "CPLX\t",studyStr
# drop the '.' concatenation operators outside strings
studyStr = remove_concate(studyStr)
#print "CPLX\t",studyStr
# substitute the constants recorded from define() calls
for k in definelist:
if studyStr.find(k) >= 0:
try:
studyStr = replace_all(studyStr, k, definelist[k])
except ValueError:
# replace_all refuses a key that is contained in its own value
continue
#print "CPLX\t",studyStr
# remove the quotes and build a normalised path under rootdir
studyStr = replace_all(studyStr, "'",'')
studyStr = replace_all(studyStr, '"','')
studyStr = os.path.normpath(rootdir + studyStr)
studyStr = ''.join(studyStr.split())
#print "CPLX\t",studyStr
if os.path.isfile(studyStr):
for l in include_file(studyStr):
buff.append(l)
# placeholder include statement with an empty argument
buff.append("include ('');")
# text located after the original statement (keyword + argument + 1 char)
buff.append(string[string.find(lookfor + savedStudyStr) + len (lookfor + savedStudyStr)+1:])
# re-split the assembled text into '\n'-terminated lines
buff = ''.join(buff)
buff = buff.split('\n')
obuff = []
for a in buff:
obuff.append(a + '\n')
return obuff
def clean_spec_chars(l):
    """Return `l` without its backslash escape sequences.

    Every backslash is removed together with the character that follows it;
    a trailing lone backslash is simply dropped.
    """
    pieces = []
    skip_next = False
    for ch in l:
        if skip_next:
            # character escaped by the previous backslash
            skip_next = False
            continue
        if ch == '\\':
            skip_next = True
            continue
        pieces.append(ch)
    return "".join(pieces)
# process(fname, checkIncludes = True): preprocess the PHP file `fname` and
# return the simplified source as one string.
#
#   fname         -- path of the PHP file to read
#   checkIncludes -- when true, lines in which include_in_line() finds an
#                    include/require are expanded through workout_include()
#
# Two passes are made:
#   1. the file is read line by line; include lines are expanded, define()
#      calls are recorded in the global `definelist`, and the lines are
#      collected in `out_buff`;
#   2. a state machine (same states as clean_string, starting in "html")
#      scans the collected lines and builds `in_buff`, dropping comments and
#      string contents. Unlike clean_string, `state` is kept from one line to
#      the next.
# The result is then lower-cased (srcToLower) and "or die(...)" clauses are
# removed.
#
# NOTE(review): the indentation of this function is not visible here; the
# nesting of the branches (escaped-quote handling, position of the final
# `i += 1`) must be confirmed against the original file.
# NOTE(review): the file object is never closed explicitly in this function.
def process(fname, checkIncludes = True):
global definelist
# `html` is not referenced again in this function
html = True
out_buff = []
in_buff = ""
try:
f = open(fname, 'r')
except IOError:
# NOTE(review): `f` is not bound when open() fails, so this print itself
# raises; `fname` was presumably intended. The error path returns the
# integer 1 whereas the success path returns a string.
print "Cannot open the file", f
return 1
# `listStates` is not referenced again in this function
listStates = ["html", "source", "string", "comment","array"]
state = "html"
com_delim = None
# directory part of fname, trailing '/' or '\' included ('' when there is none)
root = fname[:max(fname.rfind('/')+1, fname.rfind('\\')+1)]
# xreadlines() exists on Python 2 file objects only
for l in f.xreadlines():
# cleaned copy of the line: comments and string contents removed
cl = clean_string(l)
if checkIncludes and include_in_line(cl):
# the raw line (not the cleaned one) is handed to workout_include
buff = workout_include(l,root)
for a in buff:
out_buff.append(a)
else:
if 'define' in l:
# NOTE(review): the offset is computed on the raw line `l` but applied
# to the cleaned line `cl`, whose string contents were removed; the two
# may not line up -- confirm.
s = cl[l.find('define'):]
s = s[:s.find(';')]
if define.match(s):
out = define.search(s)
key = out.group(1)
val = out.group(2)
# strip both kinds of quotes from key and value before recording them
val = replace_all(val, "'",'')
val = replace_all(val, '"','')
key = replace_all(key, "'",'')
key = replace_all(key, '"','')
definelist[key] = val
out_buff.append(l)
# finite state machine
for o in out_buff:
l = o
# backslash escapes are removed, except on lines containing "/*"
if "/*" not in o:
l = clean_spec_chars(o)
i,max_len = 0,len(l)
while i < max_len:
try:
if state == "html":
# outside PHP: wait for "<?" (optionally followed by "php") and skip it
if l[i] == '<' and l[i+1] == '?':
if l[i+2] in ('p','P') and l[i+3]in ('h','H') and l[i+4]in ('p','P'):
i+=4
else:
i+=1
state = "source"
else:
if state == "comment":
# inside /* ... */: look for the closing "*/"
if l[i] == '*':
if l[i+1] =='/' :
state = "source"
i += 1
elif state == "string":
if l[i] == com_delim:
# candidate closing quote; l[i-1] / l[i-2] are checked for backslashes
if l[i-1] == '\\':
if l[i-2] != '\\':
in_buff += l[i]
state = "source"
com_delim = None
else:
in_buff += l[i]
state = "source"
com_delim = None
if l[i-2] == '\\':
in_buff += l[i]
state = "source"
com_delim = None
elif l[i] == '$':
# a PHP variable inside the string: keep the '$' and its name
in_buff += l[i]
i+=1
# add the variables
# NOTE(review): l[i] is evaluated before `i < max_len`; running off
# the end of the line raises IndexError (caught below).
while is_alpha(l[i]) and i < max_len:
in_buff += l[i]
i+=1
if l[i] == com_delim:
in_buff += com_delim
state = "source"
else:
in_buff += ' '
else:
# simply in source
if l[i] == '?' and l[i+1] == '>':
#in_buff += ";"
# PHP closing tag: nothing is appended, only the state changes
state = "html"
else:
# "//" and "#" start a line comment: stop scanning this line
if l[i] == '/' and l[i+1] == '/':
break
elif l[i] == '/' and l[i+1] == '*':
state = "comment"
elif l[i] == '#':
break
# a quote opens a string, except while inside [...]
elif state != "array" and (l[i] == '"' or l[i] == "'"):
state = "string"
com_delim = l[i]
in_buff += l[i]
elif l[i] =='[' and state != "array":
state = "array"
in_buff += l[i]
elif l[i] == ']' and state == "array":
state = "source"
in_buff += l[i]
else:
in_buff += l[i]
i += 1
except IndexError:
# NOTE(review): `continue` re-enters the while loop without advancing `i`;
# if the IndexError came from a look-ahead such as l[i+1] the same index
# is retried -- possible endless loop, confirm.
continue
# remove the 'or die'
in_buff = srcToLower(in_buff)
in_buff = replace_die(in_buff)
# two literal leftovers that are removed explicitly
in_buff = in_buff.replace("or die(\"\" . mysql_error(\"\"))", "")
in_buff = in_buff.replace("or die (mysql_error())", "")
return in_buff
# A run of identifier characters, '$' included.
regword = re.compile('([$a-zA-Z0-9_]+)',re.I)


def myRegLower(x):
    """Substitution callback: lower-case the matched word unless it contains '$'."""
    word = x.group()
    return word if '$' in word else word.lower()


def srcToLower(buff):
    """Lower-case every word of `buff`, leaving words that contain '$' untouched."""
    return regword.sub(myRegLower, buff)
# "or die(<argument>)" where the argument is made of word characters, '$',
# '.', quotes and whitespace only (case-insensitive).
regdir = re.compile(r'or([\s]+)die([\s]*)\(([$_\w\d.\'"\s]+)\)',re.I)


def replace_die(buff):
    """Return `buff` with every "or die(...)" clause matched by `regdir` removed."""
    return regdir.sub('', buff)
def pp_file(fname):
    """Preprocess `fname` and write the result next to it.

    The output file is named after the input with its ".php" suffix replaced
    by ".preproc.php" (the suffix is simply appended when the name contains
    no ".php"). Returns the name of the file that was written.
    """
    s = process(fname)
    if not isinstance(s, str):
        # process() returns a non-string status when it could not read the file
        s = ""

    # rfind() yields -1 when ".php" is absent; slicing with -1 would chop
    # the last character of the name.
    ext_pos = fname.rfind('.php')
    base = fname if ext_pos == -1 else fname[:ext_pos]
    nname = base + '.preproc.php'

    with open(nname, "w") as out:
        # process() only switches state on the PHP open/close tags and never
        # copies them, so they are restored around the preprocessed source.
        out.write("<?php\n" + s + "\n?>\n")
    return nname
if __name__ == "__main__":
    # Command-line entry point: expects the PHP file to preprocess as the
    # first argument.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s <file.php>\n" % sys.argv[0])
        sys.exit(1)
    fname = sys.argv[1]
    pp_file(fname)