Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Create elements chain string as we store it #823

Merged
merged 16 commits into from
Nov 20, 2023
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/__tests__/autocapture-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {
isAngularStyleAttr,
getNestedSpanText,
getDirectAndNestedSpanText,
getElementsChainString,
} from '../autocapture-utils'

describe(`Autocapture utility functions`, () => {
Expand Down Expand Up @@ -397,4 +398,19 @@ describe(`Autocapture utility functions`, () => {
expect(getNestedSpanText(parent)).toBe('test test2')
})
})

// Covers the serialisation of autocapture element lists into the
// `$elements_chain` string.
describe('getElementsChainString', () => {
    // An empty element list yields an empty chain.
    it('should return an empty string with no elements', () => {
        expect(getElementsChainString([])).toEqual('')
    })

    // A single element renders as `<tag>:<key>="<value>"...`.
    it('should process elements correctly', () => {
        const singleDiv = { tag_name: 'div', nth_child: 1, nth_of_type: 2, $el_text: 'text' }

        const chain = getElementsChainString([singleDiv])

        expect(chain).toEqual('div:text="text"nth-child="1"nth-of-type="2"')
    })
})
})
38 changes: 17 additions & 21 deletions src/__tests__/autocapture.js
Original file line number Diff line number Diff line change
Expand Up @@ -706,10 +706,10 @@ describe('Autocapture system', () => {
const props = captureArgs[1]
expect(event).toBe('$autocapture')
expect(props['$event_type']).toBe('click')
expect(props['$elements'][0]).toHaveProperty('attr__href', 'http://test.com')
expect(props['$elements'][1]).toHaveProperty('tag_name', 'span')
expect(props['$elements'][2]).toHaveProperty('tag_name', 'div')
expect(props['$elements'][props['$elements'].length - 1]).toHaveProperty('tag_name', 'body')
expect(props['$elements_chain']).toContain('attr__href="http://test.com"')
expect(props['$elements_chain']).toContain('span:')
expect(props['$elements_chain']).toContain('div:')
expect(props['$elements_chain']).toContain('body:')
})

it('truncate any element property value to 1024 bytes', () => {
Expand All @@ -733,7 +733,7 @@ describe('Autocapture system', () => {
const captureArgs = lib.capture.args[0]
const props = captureArgs[1]
expect(longString).toBe('prop'.repeat(400))
expect(props['$elements'][0]).toHaveProperty('attr__data-props', 'prop'.repeat(256) + '...')
expect(props['$elements_chain']).toContain('attr__data-props="' + 'prop'.repeat(256) + '..."')
})

it('gets the href attribute from parent anchor tags', () => {
Expand All @@ -750,7 +750,7 @@ describe('Autocapture system', () => {
},
lib
)
expect(getCapturedProps(lib.capture)['$elements'][0]).toHaveProperty('attr__href', 'http://test.com')
expect(getCapturedProps(lib.capture)['$elements_chain']).toContain('attr__href="http://test.com"')
})

it('does not capture href attribute values from password elements', () => {
Expand Down Expand Up @@ -784,7 +784,7 @@ describe('Autocapture system', () => {
},
lib
)
expect(getCapturedProps(lib.capture)['$elements'][0]).not.toHaveProperty('attr__href')
expect(getCapturedProps(lib.capture)['$elements_chain']).not.toContain('a:attr__href')
})

it('does not capture href attribute values that look like credit card numbers', () => {
Expand All @@ -801,7 +801,7 @@ describe('Autocapture system', () => {
},
lib
)
expect(getCapturedProps(lib.capture)['$elements'][0]).not.toHaveProperty('attr__href')
expect(getCapturedProps(lib.capture)['$elements_chain']).not.toContain('a:attr__href')
})

it('does not capture href attribute values that look like social-security numbers', () => {
Expand All @@ -818,7 +818,7 @@ describe('Autocapture system', () => {
},
lib
)
expect(getCapturedProps(lib.capture)['$elements'][0]).not.toHaveProperty('attr__href')
expect(getCapturedProps(lib.capture)['$elements_chain']).not.toContain('a:attr__href')
})

it('correctly identifies and formats text content', () => {
Expand Down Expand Up @@ -866,7 +866,7 @@ describe('Autocapture system', () => {
const props1 = getCapturedProps(lib.capture)
const text1 =
"Some super duper really long Text with new lines that we'll strip out and also we will want to make this text shorter since it's not likely people really care about text content that's super long and it also takes up more space and bandwidth. Some super d"
expect(props1['$elements'][0]).toHaveProperty('$el_text', text1)
expect(props1['$elements_chain']).toContain(`text="${text1}"`)
expect(props1['$el_text']).toEqual(text1)
lib.capture.resetHistory()

Expand All @@ -876,7 +876,7 @@ describe('Autocapture system', () => {
}
autocapture._captureEvent(e2, lib)
const props2 = getCapturedProps(lib.capture)
expect(props2['$elements'][0]).toHaveProperty('$el_text', 'Some text')
expect(props2['$elements_chain']).toContain('text="Some text"')
expect(props2['$el_text']).toEqual('Some text')
lib.capture.resetHistory()

Expand All @@ -886,8 +886,7 @@ describe('Autocapture system', () => {
}
autocapture._captureEvent(e3, lib)
const props3 = getCapturedProps(lib.capture)
expect(props3['$elements'][0]).toHaveProperty('$el_text', '')
expect(props3).not.toHaveProperty('$el_text')
expect(props3['$elements_chain']).not.toContain('text=""')
})

it('does not capture sensitive text content', () => {
Expand Down Expand Up @@ -916,8 +915,7 @@ describe('Autocapture system', () => {
}
autocapture._captureEvent(e1, lib)
const props1 = getCapturedProps(lib.capture)
expect(props1['$elements'][0]).toHaveProperty('$el_text')
expect(props1['$elements'][0]['$el_text']).toMatch(/Why\s+hello\s+there/)
expect(props1['$elements_chain']).toContain('text="Why hello there"')
lib.capture.resetHistory()

const e2 = {
Expand All @@ -926,8 +924,7 @@ describe('Autocapture system', () => {
}
autocapture._captureEvent(e2, lib)
const props2 = getCapturedProps(lib.capture)
expect(props2['$elements'][0]).toHaveProperty('$el_text')
expect(props2['$elements'][0]['$el_text']).toMatch(/Why\s+hello\s+there/)
expect(props2['$elements_chain']).toContain('text="Why hello there"')
lib.capture.resetHistory()

const e3 = {
Expand All @@ -936,8 +933,7 @@ describe('Autocapture system', () => {
}
autocapture._captureEvent(e3, lib)
const props3 = getCapturedProps(lib.capture)
expect(props3['$elements'][0]).toHaveProperty('$el_text')
expect(props3['$elements'][0]['$el_text']).toMatch(/Why\s+hello\s+there/)
expect(props3['$elements_chain']).toContain('text="Why hello there"')
})

it('should capture a submit event with form field props', () => {
Expand Down Expand Up @@ -1031,7 +1027,7 @@ describe('Autocapture system', () => {
autocapture._captureEvent(e1, newLib)

const props1 = getCapturedProps(newLib.capture)
expect('attr__formmethod' in props1['$elements'][0]).toEqual(false)
expect(props1['$elements_chain']).not.toContain('attr__formmethod')
})

it('does not capture any textContent if mask_all_text is set', () => {
Expand Down Expand Up @@ -1060,7 +1056,7 @@ describe('Autocapture system', () => {
autocapture._captureEvent(e1, newLib)
const props1 = getCapturedProps(newLib.capture)

expect(props1['$elements'][0]).not.toHaveProperty('$el_text')
expect(props1['$elements_chain']).not.toHaveProperty('text')
})
})

Expand Down
102 changes: 99 additions & 3 deletions src/autocapture-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
* @param {Element} el - element to get the className of
* @returns {string} the element's class
*/
import { AutocaptureConfig } from 'types'
import { _each, _includes, _trim } from './utils'

import { _isNull, _isString, _isUndefined } from './utils/type-utils'
import { AutocaptureConfig, Properties } from 'types'
import { _each, _entries, _includes, _trim } from './utils'

import { _isArray, _isNull, _isString, _isUndefined } from './utils/type-utils'
import { logger } from './utils/logger'

export function getClassName(el: Element): string {
Expand Down Expand Up @@ -345,3 +346,98 @@ export function getNestedSpanText(target: Element): string {
}
return text
}

/**
 * Builds the `elements_chain` string for a list of autocapture elements.
 *
 * Background: when events were stored in Postgres, autocapture events carried
 * an `Elements` list, and converting it to `elements_chain` happened during
 * ingestion. That parsing/processing code has been copied over here as-is.
 *
 * TODO: optimise this to create the elements_chain string directly instead of
 * going through the intermediate element objects.
 *
 * @param elements - raw element property bags, one per captured element
 * @returns the serialised chain; an empty list yields an empty string
 */
export function getElementsChainString(elements: Properties[]): string {
    const parsedElements = extractElements(elements)
    return elementsToString(parsedElements)
}

/**
 * Intermediate representation of one captured element.
 *
 * This mirrors the interface named `Element` in plugin-scaffold:
 * https://github.com/PostHog/plugin-scaffold/blob/b07d3b879796ecc7e22deb71bf627694ba05386b/src/types.ts#L200
 * It is renamed here because `Element` already refers to the DOM Element type
 * when running in the browser.
 */
interface PHElement {
    // --- identity of the element ---
    /** Tag name, copied from the raw `tag_name` property. */
    tag_name?: string
    /** Text content, taken from `$el_text` (truncated to 400 characters on extraction). */
    text?: string
    /** Link target, taken from `attr__href` (truncated to 2048 characters on extraction). */
    href?: string

    // --- well-known attributes ---
    /** Value of the raw `attr__id` property. */
    attr_id?: string
    /** Class names derived from the raw `attr__class` property. */
    attr_class?: string[]

    // --- position among siblings (serialised as 0 when absent) ---
    nth_child?: number
    nth_of_type?: number

    /** All raw properties whose key starts with `attr__`, keyed by that raw key. */
    attributes?: Record<string, any>

    // NOTE(review): the fields below are neither populated nor read by the code
    // in this file; presumably kept for parity with plugin-scaffold — confirm.
    event_id?: number
    order?: number
    group_id?: number
}

/**
 * Backslash-escapes double quotes in `input`.
 *
 * A quote that is already preceded by a backslash is matched together with
 * that backslash and rewritten to the same two characters, so input that is
 * already escaped is not escaped a second time.
 *
 * @param input - text to escape
 * @returns `input` with every `"` (or `\"`) rendered as `\"`
 */
function escapeQuotes(input: string): string {
    const quoteOrEscapedQuote = /"|\\"/g
    const escapedQuote = '\\"'
    return input.replace(quoteOrEscapedQuote, escapedQuote)
}

/**
 * Serialises parsed elements into the `elements_chain` string.
 *
 * Each element is rendered as `<tag>.<class>...:<key>="<value>"...` and the
 * rendered elements are joined with `;`. Class names are sorted and stripped of
 * double quotes. Attribute keys and values have their double quotes
 * backslash-escaped so that a quote inside a value cannot terminate the
 * surrounding `key="value"` pair.
 *
 * Attributes are emitted in the order they are assembled below: `text`,
 * `nth-child`, `nth-of-type`, `href`, `attr_id`, then the entries of
 * `element.attributes` (assuming `_entries` iterates like `Object.entries`).
 *
 * NOTE(review): the previous implementation built a sorted, escaped copy of the
 * attributes but then serialised the raw object, so neither the sort nor the
 * escaping ever reached the output. Escaping is now applied. The emission order
 * is intentionally left as it was so existing output stays byte-identical for
 * values without quotes; if alphabetical order is wanted, sort the entries by
 * key before mapping and update the expected strings in the tests.
 *
 * @param elements - parsed elements, outermost ordering as supplied by the caller
 * @returns the chain string; an empty list yields an empty string
 */
function elementsToString(elements: PHElement[]): string {
    return elements
        .map((element) => {
            let elString = ''
            if (element.tag_name) {
                elString += element.tag_name
            }
            if (element.attr_class) {
                // Sort a copy: Array.prototype.sort works in place and the array may
                // be the very one the caller passed in as `attr__class`.
                const sortedClasses = element.attr_class.slice().sort()
                for (const singleClass of sortedClasses) {
                    elString += `.${singleClass.replace(/"/g, '')}`
                }
            }
            const attributes: Record<string, any> = {
                ...(element.text ? { text: element.text } : {}),
                'nth-child': element.nth_child ?? 0,
                'nth-of-type': element.nth_of_type ?? 0,
                ...(element.href ? { href: element.href } : {}),
                ...(element.attr_id ? { attr_id: element.attr_id } : {}),
                ...element.attributes,
            }
            elString += ':'
            elString += _entries(attributes)
                // String(...) rather than .toString() so null/undefined values are
                // rendered (as they would be by a template literal) instead of throwing.
                .map(([key, value]) => `${escapeQuotes(String(key))}="${escapeQuotes(String(value))}"`)
                .join('')
            return elString
        })
        .join(';')
}

/**
 * Converts raw autocapture element property bags into `PHElement` records.
 *
 * Well-known properties are copied to dedicated fields (`$el_text` and
 * `attr__href` are truncated to 400 and 2048 characters respectively), and
 * every property whose key starts with `attr__` is additionally collected,
 * under its raw key, into `attributes`.
 *
 * @param elements - raw element properties, one entry per element
 * @returns one `PHElement` per input entry, in the same order
 */
function extractElements(elements: Properties[]): PHElement[] {
    return elements.map((rawElement) => {
        const text = rawElement['$el_text']?.slice(0, 400)
        const tagName = rawElement['tag_name']
        const href = rawElement['attr__href']?.slice(0, 2048)
        const attrClass = extractAttrClass(rawElement)
        const attrId = rawElement['attr__id']
        const nthChild = rawElement['nth_child']
        const nthOfType = rawElement['nth_of_type']

        // Gather every `attr__`-prefixed property, keeping the raw key.
        const attributes: { [id: string]: any } = {}
        for (const [key, value] of _entries(rawElement)) {
            if (key.indexOf('attr__') === 0) {
                attributes[key] = value
            }
        }

        return {
            text,
            tag_name: tagName,
            href,
            attr_class: attrClass,
            attr_id: attrId,
            nth_child: nthChild,
            nth_of_type: nthOfType,
            attributes,
        }
    })
}

/**
 * Reads the `attr__class` property of a raw element as a list of class names.
 *
 * @param el - raw element properties
 * @returns `undefined` when `attr__class` is missing or falsy; the value itself
 *          when it is already an array; otherwise the value split on single spaces
 */
function extractAttrClass(el: Properties): PHElement['attr_class'] {
    const rawClass = el['attr__class']
    if (!rawClass) {
        return undefined
    }
    return _isArray(rawClass) ? rawClass : rawClass.split(' ')
}
3 changes: 2 additions & 1 deletion src/autocapture.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
isAngularStyleAttr,
isDocumentFragment,
getDirectAndNestedSpanText,
getElementsChainString,
} from './autocapture-utils'
import RageClick from './extensions/rageclick'
import { AutocaptureConfig, AutoCaptureCustomProperty, DecideResponse, Properties } from './types'
Expand Down Expand Up @@ -255,7 +256,7 @@ const autocapture = {
const props = _extend(
this._getDefaultProperties(e.type),
{
$elements: elementsJson,
$elements_chain: getElementsChainString(elementsJson),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should gradually roll this out to team 2 first for safety, see how capture-rs rollouts are happening

},
elementsJson[0]?.['$el_text'] ? { $el_text: elementsJson[0]?.['$el_text'] } : {},
this._getCustomProperties(targetElementList),
Expand Down
Loading