Skip to content

Commit c72d37d

Browse files
author
David Baker Effendi
authored
[ruby] Regex Match Defines $N Vars (#5303)
This PR implements the other component of regex matching: defining the numbered global variables. In Ruby, `$1`, `$2`, etc. refer to the capture groups of the most recent match, analogous to how a `MatchData` object refers to these groups by index. This PR models these `nref` objects as `$[N]` and, during match lowering, assigns each one the corresponding index position of the lowered temporary match object, i.e., `$[N] = <tmp-0>[N]` for every `N` from 1 up to the number of opening parentheses in the regex (a simple heuristic). Additionally, the lowered `match` calls now have their `methodFullName` defined for convenient policy/semantic definition creation.
1 parent cf8498a commit c72d37d

File tree

4 files changed

+42
-14
lines changed

4 files changed

+42
-14
lines changed

joern-cli/frontends/rubysrc2cpg/src/main/scala/io/joern/rubysrc2cpg/astcreation/AstCreatorHelper.scala

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -199,8 +199,8 @@ trait AstCreatorHelper(implicit withSchemaValidation: ValidationMode) { this: As
199199
def tmp = SimpleIdentifier()(originSpan.spanStart(tmpName))
200200

201201
val matchCall = {
202-
val code = s"${target.text}.match(${regex.text})"
203-
MemberCall(target, ".", "match", regex :: Nil)(originSpan.spanStart(code))
202+
val code = s"${regex.text}.match(${target.text})"
203+
MemberCall(regex, ".", "match", target :: Nil)(originSpan.spanStart(code))
204204
}
205205
val tmpAssignment = {
206206
val code = s"$tmpName = ${matchCall.text}"
@@ -217,14 +217,24 @@ trait AstCreatorHelper(implicit withSchemaValidation: ValidationMode) { this: As
217217
val tildeCode = s"$$~ = $tmpName"
218218
val tildeAssign = SingleAssignment(globalTilde, "=", tmp)(originSpan.spanStart(tildeCode))
219219

220-
def zero = StaticLiteral(getBuiltInType(Defines.Integer))(originSpan.spanStart("0"))
221-
val tmpIndex0 = IndexAccess(tmp, zero :: Nil)(originSpan.spanStart(s"$tmpName[0]"))
220+
def intLiteral(n: Int) = StaticLiteral(getBuiltInType(Defines.Integer))(originSpan.spanStart(s"$n"))
221+
val tmpIndex0 = IndexAccess(tmp, intLiteral(0) :: Nil)(originSpan.spanStart(s"$tmpName[0]"))
222222

223223
val ampersandCode = s"$$& = $tmpName[0]"
224224
val ampersandAssign = SingleAssignment(globalAmpersand, "=", tmpIndex0)(originSpan.spanStart(ampersandCode))
225+
226+
// use a simple heuristic to determine the N matched groups
227+
val matchGroups = (1 to regex.text.count(_ == '(')).map { idx =>
228+
val matchGroupAsgnCode = s"$$$idx = $tmpName[$idx]"
229+
val matchGroup = MemberAccess(self, ".", "$")(originSpan.spanStart("$"))
230+
val matchGroupIndexN = IndexAccess(matchGroup, intLiteral(idx) :: Nil)(originSpan.spanStart(s"$$[$idx]"))
231+
val tmpIndexN = IndexAccess(tmp, intLiteral(idx) :: Nil)(originSpan.spanStart(s"$tmpName[$idx]"))
232+
SingleAssignment(matchGroupIndexN, "=", tmpIndexN)(originSpan.spanStart(matchGroupAsgnCode))
233+
}.toList
234+
225235
// tmp.begin(0) is the lowered return value of `=~`
226-
val beginCall = MemberCall(tmp, ".", "begin", zero :: Nil)(originSpan.spanStart(s"$tmpName.begin(0)"))
227-
StatementList(tildeAssign :: ampersandAssign :: beginCall :: Nil)(
236+
val beginCall = MemberCall(tmp, ".", "begin", intLiteral(0) :: Nil)(originSpan.spanStart(s"$tmpName.begin(0)"))
237+
StatementList(tildeAssign :: ampersandAssign :: Nil ++ matchGroups :+ beginCall)(
228238
originSpan.spanStart(s"$tildeCode; $ampersandCode")
229239
)
230240
},

joern-cli/frontends/rubysrc2cpg/src/main/scala/io/joern/rubysrc2cpg/astcreation/AstForExpressionsCreator.scala

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,11 @@ trait AstForExpressionsCreator(implicit withSchemaValidation: ValidationMode) {
222222
} else {
223223
code(n)
224224
}
225-
val call = callNode(n, callCode, n.methodName, XDefines.DynamicCallUnknownFullName, dispatchType)
225+
val call = if (n.isRegexMatch || RubyOperators.regexMethods(n.methodName)) {
226+
callNode(n, callCode, n.methodName, s"${getBuiltInType(Defines.Regexp)}.match", dispatchType)
227+
} else {
228+
callNode(n, callCode, n.methodName, XDefines.DynamicCallUnknownFullName, dispatchType)
229+
}
226230
if methodFullName != XDefines.DynamicCallUnknownFullName then call.possibleTypes(Seq(methodFullName))
227231
if (isStatic) {
228232
callAst(call, argumentAsts, base = Option(baseAst)).copy(receiverEdges = Nil)

joern-cli/frontends/rubysrc2cpg/src/main/scala/io/joern/rubysrc2cpg/parser/RubyJsonToNodeCreator.scala

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -766,10 +766,14 @@ class RubyJsonToNodeCreator(
766766
private def visitNil(obj: Obj): RubyExpression = StaticLiteral(getBuiltInType(Defines.NilClass))(obj.toTextSpan)
767767

768768
private def visitNthRef(obj: Obj): RubyExpression = {
769-
val span = obj.toTextSpan
770-
val name = obj(ParserKeys.Value).num.toInt
771-
val selfBase = SelfIdentifier()(span.spanStart("self"))
772-
MemberAccess(selfBase, ".", s"$$$name")(span)
769+
// We represent $1 as $[1] in order to track these arbitrary numeric accesses in a way the data-flow engine
770+
// understands
771+
val span = obj.toTextSpan
772+
val name = obj(ParserKeys.Value).num.toInt
773+
val selfBase = SelfIdentifier()(span.spanStart("self"))
774+
val amperMemberAccess = MemberAccess(selfBase, ".", "$")(span)
775+
val indexPos = StaticLiteral(getBuiltInType(Defines.Integer))(obj.toTextSpan.spanStart(name.toString))
776+
IndexAccess(amperMemberAccess, indexPos :: Nil)(obj.toTextSpan.spanStart(s"$$[$name]"))
773777
}
774778

775779
private def visitObjectInstantiation(obj: Obj): RubyExpression = {

joern-cli/frontends/rubysrc2cpg/src/test/scala/io/joern/rubysrc2cpg/querying/RegexTests.scala

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import io.shiftleft.codepropertygraph.generated.{Cpg, Operators}
77
import io.shiftleft.codepropertygraph.generated.nodes.{Call, Identifier, Literal}
88
import io.shiftleft.semanticcpg.language.*
99

10-
class RegexTests extends RubyCode2CpgFixture(withPostProcessing = true) {
10+
class RegexTests extends RubyCode2CpgFixture(withPostProcessing = false) {
1111

1212
"Global regex related variables" should {
1313

@@ -23,12 +23,13 @@ class RegexTests extends RubyCode2CpgFixture(withPostProcessing = true) {
2323
val tmpTarget = tmpInit.target.asInstanceOf[Identifier]
2424
tmpTarget.name shouldBe s"<tmp-$tmpNo>"
2525
val tmpSource = tmpInit.source.asInstanceOf[Call]
26-
tmpSource.code shouldBe s"$expectedSubject.match(/h(el)lo/)"
26+
tmpSource.code shouldBe s"/h(el)lo/.match($expectedSubject)"
2727
tmpSource.name shouldBe "match"
28+
tmpSource.methodFullName shouldBe "__core.Kernel.Regexp.match"
2829

2930
// Now test for the lowered global variable assignments
3031
val ifStmt = cpg.controlStructure.last
31-
inside(ifStmt.whenTrue.assignment.l) { case tildeAsgn :: amperAsgn :: Nil =>
32+
inside(ifStmt.whenTrue.assignment.l) { case tildeAsgn :: amperAsgn :: match1Asgn :: Nil =>
3233
tildeAsgn.code shouldBe s"$$~ = <tmp-$tmpNo>"
3334
val taSource = tildeAsgn.source.asInstanceOf[Identifier]
3435
taSource.name shouldBe s"<tmp-$tmpNo>"
@@ -46,6 +47,15 @@ class RegexTests extends RubyCode2CpgFixture(withPostProcessing = true) {
4647
val aaTarget = amperAsgn.target.asInstanceOf[Call]
4748
aaTarget.methodFullName shouldBe Operators.fieldAccess
4849
aaTarget.code shouldBe "self.$&"
50+
51+
match1Asgn.code shouldBe s"$$1 = <tmp-$tmpNo>[1]"
52+
val match1AsgnSource = match1Asgn.source.asInstanceOf[Call]
53+
match1AsgnSource.methodFullName shouldBe Operators.indexAccess
54+
match1AsgnSource.code shouldBe s"<tmp-$tmpNo>[1]"
55+
56+
val match1AsgnTarget = match1Asgn.target.asInstanceOf[Call]
57+
match1AsgnTarget.methodFullName shouldBe Operators.indexAccess
58+
match1AsgnTarget.code shouldBe "$[1]"
4959
}
5060
inside(ifStmt.whenFalse.assignment.l) { case tildeAsgn :: amperAsgn :: Nil =>
5161
tildeAsgn.code shouldBe "$~ = nil"

0 commit comments

Comments
 (0)