diff --git a/CHANGELOG.md b/CHANGELOG.md index afff2364564..0510b65ff26 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ - Set the name based on the transaction name when converting a transaction into a segment span. ([#5961](https://github.com/getsentry/relay/pull/5961)) - Set the segment name for child spans based on the transaction name. ([#5959](https://github.com/getsentry/relay/pull/5959)) - Emit missing outcomes in the Playstation and Minidump endpoints, and add/update outcome discard reasons. ([#5966](https://github.com/getsentry/relay/pull/5966)) -- Correct the PII status for `SpanData` fields. ([#5995](https://github.com/getsentry/relay/pull/5995)) +- Obtain PII values for `SpanData` fields from `sentry-conventions`. ([#5997](https://github.com/getsentry/relay/pull/5997)) **Internal**: diff --git a/relay-conventions/sentry-conventions b/relay-conventions/sentry-conventions index 108b899c2ec..523f1f046f8 160000 --- a/relay-conventions/sentry-conventions +++ b/relay-conventions/sentry-conventions @@ -1 +1 @@ -Subproject commit 108b899c2ec3f78726ade12837a067783df0c551 +Subproject commit 523f1f046f85c959a41bf746e4fb07db9994f69d diff --git a/relay-event-schema/src/processor/attrs.rs b/relay-event-schema/src/processor/attrs.rs index fec2145a20c..9d079c4ca0d 100644 --- a/relay-event-schema/src/processor/attrs.rs +++ b/relay-event-schema/src/processor/attrs.rs @@ -594,10 +594,11 @@ impl<'a> ProcessingState<'a> { /// Derives the attrs for recursion. pub fn inner_attrs(&self) -> Option> { - match self.pii() { - Pii::True => Some(Cow::Borrowed(&PII_TRUE_FIELD_ATTRS)), - Pii::False => None, - Pii::Maybe => Some(Cow::Borrowed(&PII_MAYBE_FIELD_ATTRS)), + match self.attrs().pii { + PiiMode::Static(Pii::True) => Some(Cow::Borrowed(&PII_TRUE_FIELD_ATTRS)), + PiiMode::Static(Pii::False) => None, + PiiMode::Static(Pii::Maybe) => Some(Cow::Borrowed(&PII_MAYBE_FIELD_ATTRS)), + PiiMode::Dynamic(f) => Some(Cow::Owned(DEFAULT_FIELD_ATTRS.pii_dynamic(f))), } } diff --git a/relay-event-schema/src/protocol/span.rs b/relay-event-schema/src/protocol/span.rs index effc369cc71..88c3bc6cd17 100644 --- a/relay-event-schema/src/protocol/span.rs +++ b/relay-event-schema/src/protocol/span.rs @@ -8,7 +8,7 @@ use relay_protocol::{ Annotated, Array, Empty, Error, FromValue, Getter, IntoValue, Object, Val, Value, }; -use crate::processor::ProcessValue; +use crate::processor::{Pii, ProcessValue, ProcessingState}; use crate::protocol::{ EventId, IpAddr, JsonLenientString, LenientString, Measurements, OperationType, OriginType, SpanId, SpanStatus, ThreadId, Timestamp, TraceId, @@ -454,12 +454,32 @@ impl Getter for SentryTags { } } +/// Determines the `Pii` value for a field of [`SpanData`] by looking it up in `relay-conventions`. +/// +/// If the field is not found in the conventions, this returns `Pii::True` +/// as a precaution. +fn span_data_pii_from_conventions(state: &ProcessingState) -> Pii { + fn inner(state: &ProcessingState) -> Option { + // `state.keys().next()` is the _last_ segment in the state's + // path, i.e. the field name. + let key = state.keys().next()?; + + match relay_conventions::attribute_info(key)?.pii { + relay_conventions::Pii::True => Some(Pii::True), + relay_conventions::Pii::False => Some(Pii::False), + relay_conventions::Pii::Maybe => Some(Pii::Maybe), + } + } + + inner(state).unwrap_or(Pii::True) +} + /// Arbitrary additional data on a span. /// /// Besides arbitrary user data, this type also contains SDK-provided fields used by the /// product (see ). #[derive(Clone, Debug, Default, PartialEq, Empty, FromValue, IntoValue, ProcessValue)] -#[metastructure(trim = false, pii = "maybe")] +#[metastructure(trim = false, pii = "span_data_pii_from_conventions")] pub struct SpanData { /// Mobile app start variant. /// @@ -594,11 +614,7 @@ pub struct SpanData { pub gen_ai_response_object: Annotated, /// Whether or not the AI model call's response was streamed back asynchronously - #[metastructure( - field = "gen_ai.response.streaming", - legacy_alias = "ai.streaming", - pii = "false" - )] + #[metastructure(field = "gen_ai.response.streaming", legacy_alias = "ai.streaming")] pub gen_ai_response_streaming: Annotated, /// Total output tokens per seconds throughput @@ -751,15 +767,11 @@ pub struct SpanData { pub db_collection_name: Annotated, /// The sentry environment. - #[metastructure( - field = "sentry.environment", - legacy_alias = "environment", - pii = "false" - )] + #[metastructure(field = "sentry.environment", legacy_alias = "environment")] pub environment: Annotated, /// The release version of the project. - #[metastructure(field = "sentry.release", legacy_alias = "release", pii = "false")] + #[metastructure(field = "sentry.release", legacy_alias = "release")] pub release: Annotated, /// The decoded body size of the response (in bytes). @@ -811,7 +823,7 @@ pub struct SpanData { pub thread_name: Annotated, /// ID of thread from where the span originated. - #[metastructure(field = "thread.id", pii = "false")] + #[metastructure(field = "thread.id")] pub thread_id: Annotated, /// Name of the segment that this span belongs to (see `segment_id`). @@ -831,19 +843,19 @@ pub struct SpanData { pub url_scheme: Annotated, /// User Display - #[metastructure(field = "user", pii = "true")] + #[metastructure(field = "user")] pub user: Annotated, /// User email address. /// /// - #[metastructure(field = "user.email", pii = "true")] + #[metastructure(field = "user.email")] pub user_email: Annotated, /// User’s full name. /// /// - #[metastructure(field = "user.full_name", pii = "true")] + #[metastructure(field = "user.full_name")] pub user_full_name: Annotated, /// Two-letter country code (ISO 3166-1 alpha-2). @@ -873,25 +885,25 @@ pub struct SpanData { /// Unique user hash to correlate information for a user in anonymized form. /// /// - #[metastructure(field = "user.hash", pii = "true")] + #[metastructure(field = "user.hash")] pub user_hash: Annotated, /// Unique identifier of the user. /// /// - #[metastructure(field = "user.id", pii = "true")] + #[metastructure(field = "user.id")] pub user_id: Annotated, /// Short name or login/username of the user. /// /// - #[metastructure(field = "user.name", pii = "true")] + #[metastructure(field = "user.name")] pub user_name: Annotated, /// Array of user roles at the time of the event. /// /// - #[metastructure(field = "user.roles", pii = "true")] + #[metastructure(field = "user.roles")] pub user_roles: Annotated>, /// Exclusive Time @@ -899,19 +911,24 @@ pub struct SpanData { pub exclusive_time: Annotated, /// Profile ID - #[metastructure(field = "profile_id", pii = "false")] + #[metastructure( + field = "profile_id", + // This field is not defined in conventions, so we need to set + // PII explicitly. + pii = "false" + )] pub profile_id: Annotated, /// Replay ID - #[metastructure(field = "sentry.replay_id", legacy_alias = "replay_id", pii = "false")] + #[metastructure(field = "sentry.replay_id", legacy_alias = "replay_id")] pub replay_id: Annotated, /// The sentry SDK (see [`crate::protocol::ClientSdkInfo`]). - #[metastructure(field = "sentry.sdk.name", pii = "false")] + #[metastructure(field = "sentry.sdk.name")] pub sdk_name: Annotated, /// The sentry SDK version (see [`crate::protocol::ClientSdkInfo`]). - #[metastructure(field = "sentry.sdk.version", pii = "false")] + #[metastructure(field = "sentry.sdk.version")] pub sdk_version: Annotated, /// Slow Frames @@ -975,7 +992,7 @@ pub struct SpanData { pub http_query: Annotated, /// The client's IP address. - #[metastructure(field = "client.address", pii = "true")] + #[metastructure(field = "client.address")] pub client_address: Annotated, /// The current route in the application. @@ -1013,7 +1030,6 @@ pub struct SpanData { /// Other fields in `span.data`. #[metastructure( additional_properties, - pii = "true", retain = true, skip_serialization = "null" // applies to child elements )] diff --git a/relay-pii/src/processor.rs b/relay-pii/src/processor.rs index be824b1c942..7aff390aed2 100644 --- a/relay-pii/src/processor.rs +++ b/relay-pii/src/processor.rs @@ -532,6 +532,9 @@ fn apply_regex_to_chunks<'a>( return; } + // ALERT: This logic assumes that `regex` doesn't match a capture + // group starting on a null byte. If you get an error in debug mode + // about `replacement_chunks` not being empty, check the regex. static NULL_SPLIT_RE: OnceLock = OnceLock::new(); let regex = NULL_SPLIT_RE.get_or_init(|| { #[allow(clippy::trivial_regex)] diff --git a/relay-pii/src/regexes.rs b/relay-pii/src/regexes.rs index 5493ddfc88a..2ba28442470 100644 --- a/relay-pii/src/regexes.rs +++ b/relay-pii/src/regexes.rs @@ -291,7 +291,7 @@ regex!( ) ) ( - [^/\\\r\n]+ + [^/\\\r\n\x00]+ ) " ); diff --git a/tests/integration/test_pii.py b/tests/integration/test_pii.py index d634eaa5d66..61802641004 100644 --- a/tests/integration/test_pii.py +++ b/tests/integration/test_pii.py @@ -39,21 +39,53 @@ def test_scrub_span_sentry_tags_advanced_rules(mini_sentry, relay): assert event["spans"][0]["sentry_tags"]["user.geo.subregion"] == "***" -def test_http_query(mini_sentry, relay): +@pytest.mark.parametrize( + "field,pii", + [ + # SpanData field, pii: true in conventions + ("url.full", "true"), + # SpanData field, pii: maybe in conventions + ("gen_ai.input.messages", "maybe"), + # SpanData field, pii: false in conventions + ("sentry.release", "false"), + # SpanData field, missing in conventions, explicitly set on SpanData + ("profile_id", "false"), + # Not a SpanData field, pii: true in conventions + ("ai.warnings", "true"), + # Not a SpanData field, pii: maybe in conventions + ("process.runtime.name", "maybe"), + # Not a SpanData field, pii: false in conventions + ("sentry.cancellation_reason", "false"), + # Not a SpanData field and not defined in conventions + ("madeup.field", "true"), + ], +) +def test_spandata_conventions(mini_sentry, relay, field, pii): project_id = 42 relay = relay( mini_sentry, ) config = mini_sentry.add_basic_project_config(project_id) + config["config"].setdefault( + "datascrubbingSettings", + { + "scrubData": True, + "scrubDefaults": True, + }, + ) + config["config"]["piiConfig"] = { "rules": { "custom_secret": { "type": "pattern", - "pattern": r"(X-Amz-Algorithm|X-Amz-Credential)=[^& ]+", - "redaction": {"method": "mask"}, - } + "pattern": ".*", + "redaction": {"method": "replace", "text": "[REDACTED]"}, + }, + }, + "applications": { + "$string": ["@anything:mask"], + f"'{field}'": ["custom_secret"], }, - "applications": {"$span.data.*": ["custom_secret"]}, } relay.send_event( @@ -64,9 +96,7 @@ def test_http_query(mini_sentry, relay): "timestamp": 1778131375.142457, "start_timestamp": 1778131374.296492, "exclusive_time": 845.965, - "data": { - "http.query": "X-Amz-Algorithm=Some_algorithm&X-Amz-Credential=foobar/19700101/us-east-1/s3/aws4_request&X-Amz-Signature=579375859378367127495789347628374", - }, + "data": {field: "secret value"}, } ], }, @@ -74,10 +104,17 @@ def test_http_query(mini_sentry, relay): event = mini_sentry.get_captured_envelope().get_event() - assert ( - event["spans"][0]["data"]["http.query"] - == "******************************&**********************************************************&X-Amz-Signature=579375859378367127495789347628374" - ) + value = event["spans"][0]["data"][field] + + # pii: true fields should get masked + if pii == "true": + assert value == "************" + # pii: maybe fields should get redacted + elif pii == "maybe": + assert value == "[REDACTED]" + # pii: false fields should be left alone + else: + assert value == "secret value" @pytest.mark.parametrize( diff --git a/tests/integration/test_spansv2.py b/tests/integration/test_spansv2.py index b3747087800..80455c13d92 100644 --- a/tests/integration/test_spansv2.py +++ b/tests/integration/test_spansv2.py @@ -1510,12 +1510,19 @@ def test_spansv2_attribute_normalization( http_result = spans_by_id[http_span_id] assert http_result == { **common, + "_meta": { + "attributes": { + "url.full": { + "value": {"": {"len": 63, "rem": [["@userpath", "s", 29, 35]]}} + } + } + }, "span_id": http_span_id, "attributes": { "sentry.category": {"type": "string", "value": "http"}, "sentry.description": { "type": "string", - "value": "GET https://www.service.io/users/01234-qwerty/settings/98765-adfghj", + "value": "GET https://www.service.io/users/[user]/settings/98765-adfghj", }, "sentry.op": {"type": "string", "value": "http.client"}, "sentry.observed_timestamp_nanos": { @@ -1528,7 +1535,7 @@ def test_spansv2_attribute_normalization( "sentry.domain": {"type": "string", "value": "*.service.io"}, "url.full": { "type": "string", - "value": "https://www.service.io/users/01234-qwerty/settings/98765-adfghj", + "value": "https://www.service.io/users/[user]/settings/98765-adfghj", }, }, }