% re.pl
% Andrew Davison, June 1998, dandrew@ratree.psu.ac.th

% Prolog predicates for using POSIX regular expressions

% Comments and improvements heartily requested.
% If you use this code, please mention me.

% CHANGES
% 17/6/98.  Placed a cut at the end of regexpr/4 before
%           the output bindings are made. This is to stop backtracking
%           into the stream code which causes input problems in BinProlog.
%
% 18/6/98.  Added regexprall/4 and regvarsall/3.
%           Modified the string-writing predicates not to output newlines.

% Predicates:
%     regexpr(Regex, ToMatch, Result, PMats)
%     regexprall(Regex, ToMatch, Total, PMats)
%     regvars(Regex, ToMatch, StringsList)
%     regvarsall(Regex, ToMatch, StringsList)
%     regsub(Regex, Sentence, Sub, NewSentence)
%     regsuball(Regex, ToMatch, Sub, NewExpr)
%     regsubF(Regex, ToMatch, Sub, NewExpr)
%     regsuballF(Regex, ToMatch, Sub, NewExpr)

% regexpr/4 calls a C program, regexpr.c, which utilises the rx
% POSIX regular expression library. The other Prolog predicates rely
% on regexpr/4.

% I didn't use the regex POSIX library because it
% gave buggy results on my Linux (Debian Linux 1.1).

% I used Prolog strings as the data structures
% for input and output, which seems to make sense. However,
% BinProlog doesn't print a string in "..." format but as
% a list of ASCII codes. This is none too friendly, so I
% wrote some predicates that print strings of characters.

% String Printing Utilities
%     ws(String)
%     wsl(StrList)
%     wp(PMats)
%     wr(Result)

% ---------------------------------------------
% regexpr/4; regexpr(+,+, -, -)

% regexpr(Regex, ToMatch, Result, PMats)
% Given a regular expression, Regex, and a string to match against,
% ToMatch, return a result and a list of matches.

% The result can be ok(Number) or err(Fun-name, Errno, Err-string)

% PMats will have Number elements; the first being the
% match for the entire Regex (this is *ONLY* true if the rx library
% is used), and any other bracketed expressions in the Regex.
% Each PMats element has the format
%     reg(StartPosn, EndPosn, SubString)
% Counting starts at 0, and EndPosn is after the end of the string.

% Examples:
% ?- regexpr("\+", "one+two+three", Result, PMats).
% Result = ok(1)
% PMats = [reg(3, 4,"+")]     % but with the string in ASCII

% ?- regexpr("([[:alpha:]]*)[[:space:]]*([[:alpha:]]*)","foo Bar",R,P),wp(P),nl.
% [reg(0, 7, "foo Bar"), reg(0, 3, "foo"), reg(4, 7, "Bar")]
% R=ok(3),
% P=[reg(0,7,[102,111,111,32,66,97,114]),reg(0,3,[102,111,111]),
%    reg(4,7,[66,97,114])]

% At the heart of regexpr/4 is a call to the C program regexpr.c
% of the form:
%     regexpr "Regex" "ToMatch"
% The output from regexpr.c is in a convenient Prolog term format
% and so easy for regexpr/4 to read in. See regexpr.c for more details.

% NOTE(review): the command line is assembled by plain concatenation and
% handed to popen/3, so a double quote (and presumably other shell
% metacharacters) inside Regex or ToMatch is not escaped. Do not pass
% untrusted text to this predicate until that has been addressed.

regexpr(Regex, ToMatch, Result, PMats) :-
    appendL(["regexpr """, Regex, """ """, ToMatch, """"], RegStr),
    current_input(FromKeyBd),
    name(RegCmd, RegStr),
    popen(RegCmd, read, Stream),        % call regexpr.c
    set_input(Stream),
    % Read the reply inside an if-then-else so that a failed read (for
    % instance a reply that is neither ok/1 nor err/3) cannot skip the
    % clean-up below and leave current input attached to the pipe.
    (   read(Res),
        get_pats(Res, PMs)
    ->  Status = done
    ;   Status = failed
    ),
    pclose(Stream),
    set_input(FromKeyBd),
    !,                                  % never backtrack into the stream code
    Status = done,
    Result = Res,
    PMats = PMs.

% get_pats(+Res, -PMats)
% An err/3 reply carries no matches; an ok(Num) reply is followed by
% Num further terms on the current input, which are read into PMats.
get_pats(err(_,_,_), []).
get_pats(ok(Num), PMats) :-
    get_mpats(0, Num, PMats).

% get_mpats(+Cnt, +Num, -PMats)
% Read terms from the current input until Cnt has reached Num.
get_mpats(Num, Num, []).
get_mpats(Cnt, Num, [Pat|Pats]) :-
    Cnt < Num,
    read(Pat),
    Cnt1 is Cnt + 1,
    get_mpats(Cnt1, Num, Pats).

% appendL(+ListOfLists, -List)
% List is the concatenation of all the lists in ListOfLists.
appendL([], []).
appendL([X|Rest], List) :-
    appendL(Rest, RestList),
    append(X, RestList, List).


% ---------------------------------------------
% regexprall/4; regexprall(+, +, -, -)

% regexprall(Regex, ToMatch, Total, PMats)
% Regex and ToMatch as in regexpr/4. regexprall/4
% repeatedly tries to match Regex against ToMatch
% (regexpr/4 stops after the first match).

% PMats is bound to a list of all the successful matches.
% Total is the total number of matches including
% substring matches.

% Total will be 0, and PMats will be [], if there are
% no possible matches.

% Example:
% ?- regexprall("[[:alpha:]]+", "The Third Man", Tot, PMats), wp(PMats),nl.
% [reg(0, 3, "The"), reg(4, 9, "Third"), reg(10, 13, "Man")]
% Tot=3,
% PMats=[reg(0,3,[84,104,101]),reg(4,9,[84,104,105,114,100]),
%        reg(10,13,[77,97,110])]

% Implementation notes:
%  * The search is committed with if-then-else, so exactly one answer is
%    produced; there is no catch-all clause to hand out shorter answers
%    on backtracking.
%  * A match that ends at position 0 (an empty match) does not shorten
%    ToMatch. To guarantee termination the search then resumes one
%    character further on, or stops if ToMatch is already empty.
regexprall(Regex, ToMatch, Total, FPMs) :-
    (   regexpr(Regex, ToMatch, ok(Num), PMats),
        PMats = [reg(_, End, _)|_]
    ->  (   regexprall_advance(End, ToMatch, Offset, EndStr)
        ->  regexprall(Regex, EndStr, RestTotal, PMs),
            add_offset(Offset, PMs, PMs1)       % make positions relative to ToMatch
        ;   RestTotal = 0,                      % nothing left to search
            PMs1 = []
        ),
        append(PMats, PMs1, FPMs),
        Total is Num + RestTotal
    ;   Total = 0,
        FPMs = []
    ).

% regexprall_advance(+End, +ToMatch, -Offset, -EndStr)
% EndStr is the part of ToMatch still to be searched and Offset is the
% position of EndStr within ToMatch. Fails when no progress is possible.
regexprall_advance(End, ToMatch, End, EndStr) :-
    End > 0, !,
    find_last(0, End, ToMatch, EndStr).
regexprall_advance(0, [_|EndStr], 1, EndStr).

% add_offset(+Offset, +Regs, -NRegs)
% Add Offset to the start and end positions of every reg/3 in Regs.
add_offset(_, [], []).
add_offset(Offset, [reg(S,E,Str)|Regs], [reg(Start,End,Str)|NRegs]) :-
    Start is Offset + S,
    End is Offset + E,
    add_offset(Offset, Regs, NRegs).


% ---------------------------------------------
% regvars/3; regvars(+, +, -)

% regvars(Regex, ToMatch, Strs)
% Regex and ToMatch as in regexpr/4. Strs will be bound
% to a list of matching substrings.

% Examples:
% ?- regvars("\+", "one+two+three", Strs).
% Strs = ["+"]     % but with the string in ASCII

% ?- regvars("([[:alpha:]]*)[[:space:]]*([[:alpha:]]*)","foo Bar",P),wsl(P),nl.
% ["foo Bar", "foo", "Bar"]
% P=[[102,111,111,32,66,97,114],[102,111,111],[66,97,114]]

regvars(Regex, ToMatch, Strs) :-
    regexpr(Regex, ToMatch, ok(_), PMats),
    get_strs(PMats, Strs).

% get_strs(+PMats, -Strs)
% Strs holds the substring field of each reg/3 term in PMats.
get_strs([], []).
get_strs([reg(_, _, Str)|PMats], [Str|Strs]) :-
    get_strs(PMats, Strs).


% ---------------------------------------------
% regvarsall/3; regvarsall(+, +, -)

% regvarsall(Regex, ToMatch, Strs)
% Regex and ToMatch as in regexpr/4.
% Regex is repeatedly matched against ToMatch, and
% all the matching substrings are placed in Strs.

% Strs will be [] if there are no possible matches.

% Example:
% ?- regvarsall("[[:digit:]]+", "23/6/1998", Strs), wsl(Strs),nl.
% ["23", "6", "1998"]
% Strs=[[50,51],[54],[49,57,57,56]]

regvarsall(Regex, ToMatch, Strs) :-
    regexprall(Regex, ToMatch, _, PMats),
    get_strs(PMats, Strs).


% ---------------------------------------------
% regsub/4; regsub(+, +, +, -).
% regsub(Regex, ToMatch, Sub, NewExpr)
% If Regex matches against a substring in ToMatch, it
% is replaced by Sub to make NewExpr.
% The replacement is only carried out once.

% Sub may contain slashed numbers (e.g. \1, \2, .. \9) which
% will match against any bracketed parts in Regex.
% Note: \0 matches the entire Regex.

% Examples:
% ?- regsub("([^\.]*)\.c", "file.c", "gcc -c \0 -o \1.o", Ns).
% Ns = "gcc -c file.c -o file.o"     % but with the string in ASCII

% ?- regsub("([^\.]*)\.c", "file.c", "gcc -c \0 -o \1.o", Ns), ws(Ns),nl.
% "gcc -c file.c -o file.o"
% Ns=[103,99,99,32,45,99,32,102,105,108,101,46,99,32,45,111,32,102,
%     105,108,101,46,111]

% regsub/4 always returns an outermost match, so outer brackets
% are not required. This is *ONLY* true for the rx library.

% regsub/4 does *not* fail if Regex fails to match against any
% part of ToMatch, it just returns ToMatch unchanged in NewExpr.

regsub(Regex, ToMatch, Sub, NewExpr) :-
    (   regexpr(Regex, ToMatch, ok(Num), PMats),
        PMats = [reg(Start, End, _)|_],
        substitute(Sub, PMats, Num, NewStr),
        divide_str(0, Start, End, ToMatch, StartStr, EndStr)
    ->  appendL([StartStr, NewStr, EndStr], NewExpr)
    ;   NewExpr = ToMatch               % no match: hand ToMatch back unchanged
    ).

% substitute(+Sub, +PMats, +MaxNum, -NewStr)
% Copy Sub to NewStr, replacing each \N (N a single digit below MaxNum)
% by the substring of the N-th element of PMats. Any other character,
% including an out-of-range \N, is copied through unchanged.
substitute([], _, _, []).
substitute([92, NoChar|Rest], PMats, MaxNum, NewStr) :-    % 92 is '\'
    Num is NoChar - 48,                                    % '0' is ASCII 48
    Num >= 0, Num < MaxNum, !,                             % between 0 - MaxNum-1
    pat_select(0, Num, PMats, SubStr),
    substitute(Rest, PMats, MaxNum, RestStr),
    append(SubStr, RestStr, NewStr).
substitute([Ch|Rest], PMats, MaxNum, [Ch|NewStr]) :-
    substitute(Rest, PMats, MaxNum, NewStr).

% pat_select(+Cnt, +N, +PMats, -Str)
% Str is the substring of the element at index N, counting from Cnt.
pat_select(N, N, [reg(_, _, Str)|_], Str).
pat_select(Cnt, N, [_|Pats], Pat) :-
    Cnt < N,
    Cnt1 is Cnt + 1,
    pat_select(Cnt1, N, Pats, Pat).

% divide_str(+Cnt, +Start, +End, +MatchStr, -StartStr, -EndStr)
% StartStr is the text before position Start and EndStr is the text from
% position End onwards; the text in between (the match) is dropped.
divide_str(Start, Start, End, MatchStr, [], EndStr) :-
    find_last(Start, End, MatchStr, EndStr).
divide_str(Cnt, Start, End, [Ch|MatchStr], [Ch|Rest], EndStr) :-
    Cnt < Start,
    Cnt1 is Cnt + 1,
    divide_str(Cnt1, Start, End, MatchStr, Rest, EndStr).

% find_last(+Cnt, +End, +Str, -EndStr)
% EndStr is Str with its first End - Cnt characters removed.
find_last(End, End, EndStr, EndStr).
find_last(Cnt, End, [_|MatchStr], EndStr) :-
    Cnt < End,
    Cnt1 is Cnt + 1,
    find_last(Cnt1, End, MatchStr, EndStr).


% ---------------------------------------------
% regsuball/4; regsuball(+, +, +, -).

% regsuball(Regex, ToMatch, Sub, NewExpr)
% regsuball/4 has the same inputs as regsub/4, but
% tries to repeatedly apply Regex to ToMatch. (regsub/4
% stops after finding the first match). For each match,
% the matching text within ToMatch is replaced by Sub.
% The final result is returned in NewExpr.

% Example:
% ?- regsuball("\+", "one+two+three", " ", Ns), ws(Ns).
% "one two three"

% The repeated replacement is only carried out on text found
% within ToMatch. The matching is not applied to text added
% to NewExpr from Sub.

% Implementation notes:
%  * The match is committed with if-then-else, so only the fully
%    substituted answer is produced; a partially substituted string is
%    never offered on backtracking.
%  * A match that ends at position 0 (an empty match) consumes nothing.
%    To guarantee termination one character of ToMatch is then copied
%    across before matching continues.
regsuball(Regex, ToMatch, Sub, NewExpr) :-
    (   regexpr(Regex, ToMatch, ok(Num), PMats),
        PMats = [reg(Start, End, _)|_],
        substitute(Sub, PMats, Num, NewStr),
        divide_str(0, Start, End, ToMatch, StartStr, EndStr)
    ->  regsuball_rest(Regex, End, EndStr, Sub, NewEndStr),    % different from regsub/4
        appendL([StartStr, NewStr, NewEndStr], NewExpr)
    ;   NewExpr = ToMatch
    ).

% regsuball_rest(+Regex, +End, +EndStr, +Sub, -NewEndStr)
% Carry on substituting in EndStr, the text after a match ending at End.
regsuball_rest(Regex, End, EndStr, Sub, NewEndStr) :-
    End > 0, !,
    regsuball(Regex, EndStr, Sub, NewEndStr).
regsuball_rest(Regex, _, [Ch|Rest], Sub, [Ch|NewRest]) :- !,
    regsuball(Regex, Rest, Sub, NewRest).
regsuball_rest(_, _, [], _, []).


% ---------------------------------------------
% regsubF/4; regsubF(+, +, +, -).

% regsubF(Regex, ToMatch, Sub, NewExpr)
% Just like regsub/4 but fails if NewExpr is no different from
% ToMatch; i.e. if no substitution has been carried out

regsubF(Regex, ToMatch, Sub, NewExpr) :-
    regsub(Regex, ToMatch, Sub, NewExpr),
    ToMatch \= NewExpr.


% ---------------------------------------------
% regsuballF/4; regsuballF(+, +, +, -).

% regsuballF(Regex, ToMatch, Sub, NewExpr)
% Just like regsuball/4 but fails if NewExpr is no different from
% ToMatch; i.e. if no substitutions have been carried out

regsuballF(Regex, ToMatch, Sub, NewExpr) :-
    regsuball(Regex, ToMatch, Sub, NewExpr),
    ToMatch \= NewExpr.


% ---------------------------------------------
% Utilities for writing strings and PMats output more clearly

% write a string
% ws(+String): print String between double quotes, without a newline.
ws(String) :-
    name(Atom, String),
    write('"'), write(Atom), write('"').
% write a list of strings
% wsl(+StrList): print StrList as a bracketed, comma-separated list of
% quoted strings. No newline is written.
wsl(StrList) :-
    write('['),
    write_strlist(StrList),
    write(']').

% write_strlist(+Strs): print each string with ws/1, putting a
% separator after every string except the last.
write_strlist([]).
write_strlist([First|Others]) :-
    ws(First),
    try_comma(Others),
    write_strlist(Others).


% write a list of reg/3 matches (e.g. a PMats binding)
% wp(+PMats): print PMats as a bracketed list, showing the substring of
% each reg/3 term as quoted text. No newline is written.
wp(PMats) :-
    write('['),
    write_pmats(PMats),
    write(']').

% write_pmats(+Regs): print the reg/3 terms one after another,
% separated in the same way as write_strlist/1 separates strings.
write_pmats([]).
write_pmats([reg(From, To, Text)|Others]) :-
    write('reg('),
    write(From),
    write(', '),
    write(To),
    write(', '),
    ws(Text),
    write(')'),
    try_comma(Others),
    write_pmats(Others).

% try_comma(+Remaining): write a separator only if more items follow.
try_comma([]).
try_comma([_|_]) :-
    write(', ').


% write a Result term (which contains a string in its err/3 output)
% wr(+Result): ok/1 is written as it stands; for err/3 the message
% string is written as quoted text via ws/1.
wr(ok(Num)) :-
    write(ok(Num)).
wr(err(FunAtom, Num, String)) :-
    write('err('),
    write(FunAtom),
    write(', '),
    write(Num),
    write(', '),
    ws(String),
    write(')').