"""
This file is part of PHP-AST Project by Romain Gaucher (http://rgaucher.info).

PHP-AST is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

PHP-AST is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with PHP-AST.  If not, see <http://www.gnu.org/licenses/>.

--------------

preproc.py is a PHP preprocessor for analysis purposes. Since PHP is a
crappy language, it may be really hard to handle it correctly. I am actually
fixing some limitations of my AST converter with this tool for real project
analysis (wordpress, wikipedia etc.).

The tool is:
    - Simplifying strings (keeps only php variables)
    - Resolving the includes
    - Removing comments
"""

import os
import re
import sys
import string

include = ["include", "require"]
inc_once = ["include_once", "require_once"]
string_1 = re.compile("\"(.*)\"", re.I)
string_2 = re.compile("'(.*)'", re.I)
simpleinclude1 = re.compile(r"\"[a-zA-Z0-9_\.]+\"", re.I)
simpleinclude2 = re.compile(r"'[a-zA-Z0-9_\.]+'", re.I)
define = re.compile(r'define\((.*),(.*)\)', re.I)
# Constants collected from define() calls: name -> value (quotes stripped).
definelist = {}
php_start = [""]
# The '$' must be escaped, otherwise it is the end-of-string anchor and the
# pattern can never match a PHP variable.
var = re.compile(r'\$([a-zA-Z0-9_]+)', re.I)
letters = re.compile("[a-zA-Z0-9_]")

# Absolute paths of the files currently being processed; used to stop
# circular includes from recursing forever.
_active_files = set()

# An include/require keyword directly followed by a quote or a parenthesis
# (what reginc accepts), and a looser fallback matching the bare keyword.
_include_stmt = re.compile(r'(include|require)(_once)?(?=\s*["\'(])')
_include_word = re.compile(r'(include|require)(_once)?')


def is_alpha(c):
    """Return True when *c* starts with a letter, a digit or an underscore."""
    return letters.match(c) is not None


reginc = re.compile(r'(.*)([\W\D;\s]*)(include|require)(_once)?([\s]*)(["\'\(]+)(.*)$', re.I)


def include_in_line(string):
    """Return a match object when the line holds an include/require
    statement followed by a quote or a parenthesis, a falsy value otherwise.
    """
    if 'include' in string or 'require' in string:
        return reginc.match(string)
    return False

#print include_in_line("include(\"../includes/geshi/geshi.php\");")
#sys.exit(0)


def retFirstOccurence(string, array):
    """Return the lowest index at which any item of *array* occurs in
    *string*, or -1 when none of them occurs (or *array* is empty).
    """
    found = [p for p in (string.find(a) for a in array) if p >= 0]
    if not found:
        return -1
    return min(found)


def replace_all(s, old, new):
    """Replace *old* by *new* in *s* until no occurrence of *old* is left.

    Raises ValueError when *old* is contained in *new*, since the
    replacement would then never terminate.
    """
    if old == new:
        return s
    elif old in new:
        raise ValueError("old substring can't be part of the replacement substring.")
    while old in s:
        s = s.replace(old, new)
    return s


def include_file(filename):
    """Preprocess *filename* and return the result of process().

    Returns None when the file does not exist or is already being
    processed further up the include chain (circular include).
    """
    print("include <-- %s" % filename)
    if not os.path.isfile(filename):
        print(" --> Cannot include the file. The file doesn't exist / File not found")
        return None
    if os.path.abspath(filename) in _active_files:
        print(" --> Skipped: the file is already being processed (circular include)")
        return None
    return process(filename)


def remove_concate(string):
    """Drop the PHP concatenation dots found outside quoted strings.

    Everything between quotes is copied through unchanged.
    """
    state = "source"
    out = []
    com_delim = None
    for i, c in enumerate(string):
        if state == "source":
            if c != ".":
                out.append(c)
            if c == '"' or c == "'":
                state = "string"
                com_delim = c
        else:
            # inside a string: an unescaped delimiter closes it
            if c == com_delim and string[i - 1] != '\\':
                state = "source"
                com_delim = None
            out.append(c)
    return ''.join(out)


def _scan_line(line, state, delim, html_aware):
    """Run the cleaning state machine over one line.

    States are "html", "source", "string", "comment" and "array".
    Comments are removed, string literals are reduced to the PHP variables
    they contain, and everything else in source is copied.

    line       -- the text to scan
    state      -- state the machine is in when the line starts
    delim      -- quote character of the string being scanned, or None
    html_aware -- when true, text in the "html" state is skipped until a
                  PHP open tag; when false the "html" state behaves like
                  "source"

    Returns a tuple (emitted_text, state, delim) so that the caller can
    carry the state over to the next line.
    """
    emitted = []
    i = 0
    size = len(line)
    while i < size:
        c = line[i]
        # Look-ahead that is safe on the last character of the line.
        nxt = line[i + 1] if i + 1 < size else ''

        if html_aware and state == "html":
            if c == '<' and nxt == '?':
                # skip "<?php" (any case) or the short "<?" tag
                if line[i + 2:i + 5].lower() == 'php':
                    i += 5
                else:
                    i += 2
                state = "source"
            else:
                i += 1

        elif state == "comment":
            if c == '*' and nxt == '/':
                state = "source"
                i += 2
            else:
                i += 1

        elif state == "string":
            if c == '\\':
                # Skip the escaped character so that an escaped delimiter
                # does not terminate the string.
                i += 2
            elif c == delim:
                emitted.append(c)
                state = "source"
                delim = None
                i += 1
            elif c == '$':
                # keep the variable name
                j = i + 1
                while j < size and is_alpha(line[j]):
                    j += 1
                emitted.append(line[i:j])
                # Separate it from what follows unless the string closes
                # right after the name.
                if j >= size or line[j] != delim:
                    emitted.append(' ')
                i = j
            else:
                # any other string content is dropped
                i += 1

        else:
            # simply in source (or inside an array subscript)
            if c == '?' and nxt == '>':
                state = "html"
                i += 2
            elif c == '/' and nxt == '/':
                break
            elif c == '/' and nxt == '*':
                # Consume both characters so that "/*/" is not read as an
                # opening immediately followed by a closing.
                state = "comment"
                i += 2
            elif c == '#':
                break
            elif state != "array" and (c == '"' or c == "'"):
                state = "string"
                delim = c
                emitted.append(c)
                i += 1
            elif c == '[' and state != "array":
                state = "array"
                emitted.append(c)
                i += 1
            elif c == ']' and state == "array":
                state = "source"
                emitted.append(c)
                i += 1
            else:
                emitted.append(c)
                i += 1

    return ''.join(emitted), state, delim


def clean_string(l):
    """Return the line *l* without comments and with its string literals
    reduced to the PHP variables they contain.

    The line is always scanned as PHP source, whatever precedes it.
    """
    return _scan_line(l, "source", None, False)[0]


def _append_included(path, buff):
    """Append the preprocessed content of *path* to *buff* if it exists."""
    if os.path.isfile(path):
        content = include_file(path)
        # include_file gives None for a skipped file and process() gives 1
        # when the file cannot be opened; neither can be inlined.
        if content and content != 1:
            buff.append(content)


def workout_include(string, rootdir):
    """Inline the file included by the statement found in *string*.

    string  -- the raw source line holding the include/require statement
    rootdir -- directory prefix prepended to the included path

    Returns a list of lines, each ending with a newline: the text before
    the statement, the preprocessed content of the included file when it
    can be resolved, an empty "include ('');" placeholder, and the text
    following the statement.
    """
    keyword = _include_stmt.search(string) or _include_word.search(string)
    if keyword is None:
        # nothing to resolve, hand the line back untouched
        return [string]
    lookfor = keyword.group(0)
    pos = keyword.start()

    inStr = string[pos:]
    end = inStr.find(';')
    if end >= 0:
        after = inStr[end + 1:]
    else:
        # no terminating ';': the statement runs up to the end of the line
        end = len(inStr.rstrip('\r\n'))
        after = inStr[end:]

    buff = [string[:pos]]
    studyStr = ''.join(inStr[len(lookfor):end].split())

    if len(studyStr) > 0:
        if studyStr[0] == '(':
            # strip the surrounding parentheses
            studyStr = studyStr[1:-1]

        simple = simpleinclude1.match(studyStr) or simpleinclude2.match(studyStr)
        if simple:
            # a plain quoted file name
            toInc = simple.group(0)[1:-1]
            _append_included(rootdir + toInc, buff)
        else:
            # complex case: concatenations and define()d constants
            studyStr = remove_concate(studyStr)
            for k in definelist:
                if k in studyStr:
                    try:
                        studyStr = replace_all(studyStr, k, definelist[k])
                    except ValueError:
                        continue
            studyStr = replace_all(studyStr, "'", '')
            studyStr = replace_all(studyStr, '"', '')
            studyStr = os.path.normpath(rootdir + studyStr)
            studyStr = ''.join(studyStr.split())
            _append_included(studyStr, buff)

    buff.append("include ('');")
    buff.append(after)
    return [piece + '\n' for piece in ''.join(buff).split('\n')]


def clean_spec_chars(l):
    """Return *l* without its backslashes and the characters they escape."""
    ret = []
    i = 0
    max_len = len(l)
    while i < max_len:
        if l[i] == '\\':
            # skip the backslash; the increment below skips the escaped char
            i += 1
        else:
            ret.append(l[i])
        i += 1
    return ''.join(ret)


def _record_define(raw_line):
    """Store in definelist the constant declared by a define() call.

    The raw line is parsed because the cleaned one has lost the content of
    its string literals, i.e. the constant name and its value.
    """
    start = raw_line.find('define')
    if start < 0:
        return
    stmt = raw_line[start:]
    end = stmt.find(';')
    if end >= 0:
        stmt = stmt[:end]
    found = define.match(stmt)
    if found is None:
        return
    key = found.group(1)
    val = found.group(2)
    for quote in ("'", '"'):
        key = replace_all(key, quote, '')
        val = replace_all(val, quote, '')
    key = key.strip()
    val = val.strip()
    # An empty name would match anywhere in an include path.
    if key:
        definelist[key] = val


def process(fname, checkIncludes=True):
    """Preprocess the PHP file *fname* and return the cleaned source.

    Includes are inlined when *checkIncludes* is true, define() constants
    are recorded in definelist, HTML, PHP tags and comments are removed,
    strings are reduced to their variables, identifiers are lower-cased
    and the "or die(...)" clauses are dropped.

    Returns the resulting text, or 1 when the file cannot be opened.
    """
    try:
        f = open(fname, 'r')
    except IOError:
        print("Cannot open the file %s" % fname)
        return 1

    root = fname[:max(fname.rfind('/') + 1, fname.rfind('\\') + 1)]
    marker = os.path.abspath(fname)
    _active_files.add(marker)
    out_buff = []
    try:
        for l in f:
            cl = clean_string(l)
            if checkIncludes and include_in_line(cl):
                out_buff.extend(workout_include(l, root))
            else:
                # only consider a define() that is not commented out
                if 'define' in cl:
                    _record_define(l)
                out_buff.append(l)
    finally:
        f.close()
        _active_files.discard(marker)

    # finite state machine; the state is carried from one line to the next
    state = "html"
    com_delim = None
    pieces = []
    for o in out_buff:
        l = o
        if "/*" not in o:
            l = clean_spec_chars(o)
        text, state, com_delim = _scan_line(l, state, com_delim, True)
        pieces.append(text)
    in_buff = ''.join(pieces)

    # remove the 'or die'
    in_buff = srcToLower(in_buff)
    in_buff = replace_die(in_buff)
    in_buff = in_buff.replace("or die(\"\" . mysql_error(\"\"))", "")
    in_buff = in_buff.replace("or die (mysql_error())", "")
    return in_buff


regword = re.compile('([$a-zA-Z0-9_]+)', re.I)


def myRegLower(x):
    """Lower-case the matched word unless it contains a '$'."""
    x = x.group()
    if '$' in x:
        return x
    return x.lower()


def srcToLower(buff):
    """Lower-case every word of *buff* except the PHP variables."""
    return regword.sub(myRegLower, buff)


regdir = re.compile(r'or([\s]+)die([\s]*)\(([$_\w\d.\'"\s]+)\)', re.I)


def replace_die(buff):
    """Remove the "or die(...)" clauses from *buff*."""
    return regdir.sub(lambda x: '', buff)


def pp_file(fname):
    """Preprocess *fname* and write the result next to it.

    The output file is named after the input with '.php' replaced by
    '.preproc.php'. Returns the name of the written file, or None when
    the input could not be processed.
    """
    s = process(fname)
    if s == 1:
        return None
    cut = fname.rfind('.php')
    stem = fname[:cut] if cut >= 0 else fname
    nname = stem + '.preproc.php'
    out = open(nname, "w")
    try:
        # process() drops the PHP open/close tags, so put them back to
        # keep the output a PHP file.
        out.write("<?php\n" + s + "\n?>\n")
    finally:
        out.close()
    return nname


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("usage: %s file.php" % sys.argv[0])
        sys.exit(1)
    fname = sys.argv[1]
    pp_file(fname)