From 9a441225cb692fa1eb93f314001ddb40e1b5168b Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 19 Dec 2024 21:59:52 +0000
Subject: [PATCH 1/4] fix(profiler): update memalloc guard [backport 2.18] (#11802)

Backport 983c84f5f0981e57a4e9125d2d3ab2367d0df8b5 from #11460 to 2.18.

Previously, the memory allocation profiler used Python's builtin thread-local storage interfaces to set and get the state of a thread-local guard. I've updated a few things here.

* I think get/set idioms are slightly problematic for this type of code, since they push the responsibility of maintaining clean internal state up to the parent. A consequence of this is that propagating the underlying state _by value_ opens the door to race conditions if execution changes between contexts (unlikely here, but I think minimizing indirection is still cleaner). Accordingly, I've updated this to use native thread-local storage.
* Based on @nsrip-dd's observation, I widened the guard over `free()` operations. I believe this is correct, and if it isn't then the detriment is performance, not correctness.
* I got rid of the PY37 failovers.

We don't have any reproductions for the defects that prompted this change, but I've been running a patched library in an environment that _does_ reproduce the behavior, and I haven't seen any defects.

1. I don't believe this patch is harmful, and if our memory allocation tests pass then I believe it should be fine.
2. I have a reason to believe this fixes a critical defect, which can cause crashes.
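To make the new approach concrete, here is a minimal sketch of a native thread-local reentrancy guard of the kind this patch introduces (illustrative names only; the actual implementation is `memalloc_take_guard`/`memalloc_yield_guard` in `_memalloc_reentrant.h`, shown in the diff below):

```c
#include <stdbool.h>

// One flag per thread: no CAS or lock is needed, because only the owning
// thread ever reads or writes its own copy.
static __thread bool in_profiler = false; // MSVC would use __declspec(thread)

static bool take_guard(void)
{
    if (in_profiler)
        return false; // the profiler re-entered itself on this thread
    in_profiler = true;
    return true;
}

static void yield_guard(void)
{
    in_profiler = false;
}
```

Capturing a traceback allocates memory itself, so every sampling path first takes the guard and bails out if it is already held — the job previously done with Python TSS get/set calls.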
## Checklist
- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

Co-authored-by: David Sanchez <838104+sanchda@users.noreply.github.com>
---
 ddtrace/profiling/collector/_memalloc.c       | 125 ++++++++----
 ddtrace/profiling/collector/_memalloc_heap.c  |  93 +++++++--
 .../profiling/collector/_memalloc_reentrant.c |   3 +
 .../profiling/collector/_memalloc_reentrant.h | 186 +++++++++++++++---
 ddtrace/profiling/collector/_memalloc_tb.c    |   7 +-
 ddtrace/profiling/collector/_pymacro.h        |   4 -
 ...ng-memalloc-segfault-5593ad951405a75d.yaml |   5 +
 setup.py                                      |   5 +-
 8 files changed, 340 insertions(+), 88 deletions(-)
 create mode 100644 ddtrace/profiling/collector/_memalloc_reentrant.c
 create mode 100644 releasenotes/notes/fix-profiling-memalloc-segfault-5593ad951405a75d.yaml

diff --git a/ddtrace/profiling/collector/_memalloc.c b/ddtrace/profiling/collector/_memalloc.c
index 3b7f7db293f..3876517baaf 100644
--- a/ddtrace/profiling/collector/_memalloc.c
+++ b/ddtrace/profiling/collector/_memalloc.c
@@ -42,47 +42,95 @@ static PyObject* object_string = NULL;

 #define ALLOC_TRACKER_MAX_COUNT UINT64_MAX

+// The data coordination primitives in this and related files are related to a crash we started seeing.
+// We don't have a precise understanding of the causal factors within the runtime that lead to this condition,
+// since the GIL alone was sufficient in the past for preventing this issue.
+// We add an option here to _add_ a crash, in order to observe this condition in a future diagnostic iteration.
+// **This option is _intended_ to crash the Python process.** Do not use it without a good reason!
+static char g_crash_on_mutex_pass_str[] = "_DD_PROFILING_MEMALLOC_CRASH_ON_MUTEX_PASS";
+static const char* g_truthy_values[] = { "1", "true", "yes", "on", "enable", "enabled", NULL }; // NB the sentinel NULL
+static memlock_t g_memalloc_lock;
+
 static alloc_tracker_t* global_alloc_tracker;

+// This is a multiplatform way to define an operation to happen at static initialization time
+static void
+memalloc_init(void);
+
+#ifdef _MSC_VER
+#pragma section(".CRT$XCU", read)
+__declspec(allocate(".CRT$XCU")) void (*memalloc_init_func)(void) = memalloc_init;
+
+#elif defined(__GNUC__) || defined(__clang__)
+__attribute__((constructor))
+#else
+#error Unsupported compiler
+#endif
+static void
+memalloc_init()
+{
+    // Check if we should crash the process on mutex pass
+    char* crash_on_mutex_pass_str = getenv(g_crash_on_mutex_pass_str);
+    bool crash_on_mutex_pass = false;
+    if (crash_on_mutex_pass_str) {
+        for (int i = 0; g_truthy_values[i]; i++) {
+            if (strcmp(crash_on_mutex_pass_str, g_truthy_values[i]) == 0) {
+                crash_on_mutex_pass = true;
+                break;
+            }
+        }
+    }
+    memlock_init(&g_memalloc_lock, crash_on_mutex_pass);
+}
+
 static void
 memalloc_add_event(memalloc_context_t* ctx, void* ptr, size_t size)
 {
-    /* Do not overflow; just ignore the new events if we ever reach that point */
-    if (global_alloc_tracker->alloc_count >= ALLOC_TRACKER_MAX_COUNT)
+    uint64_t alloc_count = atomic_add_clamped(&global_alloc_tracker->alloc_count, 1, ALLOC_TRACKER_MAX_COUNT);
+
+    /* Return if we've reached the maximum number of allocations */
+    if (alloc_count == 0)
         return;

-    global_alloc_tracker->alloc_count++;
+    // Return if we can't take the guard
+    if (!memalloc_take_guard()) {
+        return;
+    }

-    /* Avoid loops */
-    if (memalloc_get_reentrant())
+    // In this implementation, the `global_alloc_tracker` isn't intrinsically protected. Before we read or modify,
+    // take the lock. The count of allocations is already forward-attributed elsewhere, so if we can't take the lock
+    // there's nothing to do.
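+    // NB: a trylock rather than a blocking lock — presumably because this path runs inside the
+    // allocator itself, where blocking could stall or deadlock an allocating thread; dropping a
+    // single sample is the cheaper failure mode.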
+    if (!memlock_trylock(&g_memalloc_lock)) {
         return;
+    }

     /* Determine if we can capture or if we need to sample */
     if (global_alloc_tracker->allocs.count < ctx->max_events) {
-        /* set a barrier so we don't loop as getting a traceback allocates memory */
-        memalloc_set_reentrant(true);
         /* Buffer is not full, fill it */
         traceback_t* tb = memalloc_get_traceback(ctx->max_nframe, ptr, size, ctx->domain);
-        memalloc_set_reentrant(false);
-        if (tb)
+        if (tb) {
             traceback_array_append(&global_alloc_tracker->allocs, tb);
+        }
     } else {
         /* Sampling mode using a reservoir sampling algorithm: replace a random
          * traceback with this one */
-        uint64_t r = random_range(global_alloc_tracker->alloc_count);
+        uint64_t r = random_range(alloc_count);

-        if (r < ctx->max_events) {
-            /* set a barrier so we don't loop as getting a traceback allocates memory */
-            memalloc_set_reentrant(true);
+        // In addition to the event count, need to check that the table itself is in a good state
+        if (r < ctx->max_events && global_alloc_tracker->allocs.tab != NULL) {
             /* Replace a random traceback with this one */
             traceback_t* tb = memalloc_get_traceback(ctx->max_nframe, ptr, size, ctx->domain);
-            memalloc_set_reentrant(false);
+
+            // Need to check not only that a tb was returned, but that it is non-NULL before using it
             if (tb) {
                 traceback_free(global_alloc_tracker->allocs.tab[r]);
                 global_alloc_tracker->allocs.tab[r] = tb;
             }
         }
     }
+
+    memlock_unlock(&g_memalloc_lock);
+    memalloc_yield_guard();
 }

 static void
@@ -98,12 +146,6 @@ memalloc_free(void* ctx, void* ptr)
     alloc->free(alloc->ctx, ptr);
 }

-#ifdef _PY37_AND_LATER
-Py_tss_t memalloc_reentrant_key = Py_tss_NEEDS_INIT;
-#else
-int memalloc_reentrant_key = -1;
-#endif
-
 static void*
 memalloc_alloc(int use_calloc, void* ctx, size_t nelem, size_t elsize)
 {
@@ -233,7 +275,10 @@ memalloc_start(PyObject* Py_UNUSED(module), PyObject* args)

     global_memalloc_ctx.domain = PYMEM_DOMAIN_OBJ;

-    global_alloc_tracker = alloc_tracker_new();
+    if (memlock_trylock(&g_memalloc_lock)) {
+        global_alloc_tracker = alloc_tracker_new();
+        memlock_unlock(&g_memalloc_lock);
+    }

     PyMem_GetAllocator(PYMEM_DOMAIN_OBJ, &global_memalloc_ctx.pymem_allocator_obj);
     PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &alloc);
@@ -258,8 +303,11 @@ memalloc_stop(PyObject* Py_UNUSED(module), PyObject* Py_UNUSED(args))
     PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &global_memalloc_ctx.pymem_allocator_obj);
     memalloc_tb_deinit();
-    alloc_tracker_free(global_alloc_tracker);
-    global_alloc_tracker = NULL;
+    if (memlock_trylock(&g_memalloc_lock)) {
+        alloc_tracker_free(global_alloc_tracker);
+        global_alloc_tracker = NULL;
+        memlock_unlock(&g_memalloc_lock);
+    }

     memalloc_heap_tracker_deinit();

@@ -310,9 +358,15 @@ iterevents_new(PyTypeObject* type, PyObject* Py_UNUSED(args), PyObject* Py_UNUSE
     if (!iestate)
         return NULL;

-    iestate->alloc_tracker = global_alloc_tracker;
     /* reset the current traceback list */
-    global_alloc_tracker = alloc_tracker_new();
+    if (memlock_trylock(&g_memalloc_lock)) {
+        iestate->alloc_tracker = global_alloc_tracker;
+        global_alloc_tracker = alloc_tracker_new();
+        memlock_unlock(&g_memalloc_lock);
+    } else {
+        Py_TYPE(iestate)->tp_free(iestate);
+        return NULL;
+    }
     iestate->seq_index = 0;

     PyObject* iter_and_count = PyTuple_New(3);
@@ -326,8 +380,11 @@ iterevents_new(PyTypeObject* type, PyObject* Py_UNUSED(args), PyObject* Py_UNUSE
 static void
 iterevents_dealloc(IterEventsState* iestate)
 {
-    alloc_tracker_free(iestate->alloc_tracker);
-    Py_TYPE(iestate)->tp_free(iestate);
+    if (memlock_trylock(&g_memalloc_lock)) {
+        alloc_tracker_free(iestate->alloc_tracker);
+
Py_TYPE(iestate)->tp_free(iestate); + memlock_unlock(&g_memalloc_lock); + } } static PyObject* @@ -442,20 +499,6 @@ PyInit__memalloc(void) return NULL; } -#ifdef _PY37_AND_LATER - if (PyThread_tss_create(&memalloc_reentrant_key) != 0) { -#else - memalloc_reentrant_key = PyThread_create_key(); - if (memalloc_reentrant_key == -1) { -#endif -#ifdef MS_WINDOWS - PyErr_SetFromWindowsErr(0); -#else - PyErr_SetFromErrno(PyExc_OSError); -#endif - return NULL; - } - if (PyType_Ready(&MemallocIterEvents_Type) < 0) return NULL; Py_INCREF((PyObject*)&MemallocIterEvents_Type); diff --git a/ddtrace/profiling/collector/_memalloc_heap.c b/ddtrace/profiling/collector/_memalloc_heap.c index d6531d7b095..d2a5cc29eee 100644 --- a/ddtrace/profiling/collector/_memalloc_heap.c +++ b/ddtrace/profiling/collector/_memalloc_heap.c @@ -9,13 +9,13 @@ typedef struct { /* Granularity of the heap profiler in bytes */ - uint32_t sample_size; + uint64_t sample_size; /* Current sample size of the heap profiler in bytes */ - uint32_t current_sample_size; + uint64_t current_sample_size; /* Tracked allocations */ traceback_array_t allocs; /* Allocated memory counter in bytes */ - uint32_t allocated_memory; + uint64_t allocated_memory; /* True if the heap tracker is frozen */ bool frozen; /* Contains the ongoing heap allocation/deallocation while frozen */ @@ -26,8 +26,42 @@ typedef struct } freezer; } heap_tracker_t; +static char g_crash_on_mutex_pass_str[] = "_DD_PROFILING_MEMHEAP_CRASH_ON_MUTEX_PASS"; +static const char* g_truthy_values[] = { "1", "true", "yes", "on", "enable", "enabled", NULL }; // NB the sentinel NULL +static memlock_t g_memheap_lock; + static heap_tracker_t global_heap_tracker; +// This is a multiplatform way to define an operation to happen at static initialization time +static void +memheap_init(void); + +#ifdef _MSC_VER +#pragma section(".CRT$XCU", read) +__declspec(allocate(".CRT$XCU")) void (*memheap_init_func)(void) = memheap_init; + +#elif defined(__GNUC__) || defined(__clang__) +__attribute__((constructor)) +#else +#error Unsupported compiler +#endif +static void +memheap_init() +{ + // Check if we should crash the process on mutex pass + char* crash_on_mutex_pass_str = getenv(g_crash_on_mutex_pass_str); + bool crash_on_mutex_pass = false; + if (crash_on_mutex_pass_str) { + for (int i = 0; g_truthy_values[i]; i++) { + if (strcmp(crash_on_mutex_pass_str, g_truthy_values[i]) == 0) { + crash_on_mutex_pass = true; + break; + } + } + } + memlock_init(&g_memheap_lock, crash_on_mutex_pass); +} + static uint32_t heap_tracker_next_sample_size(uint32_t sample_size) { @@ -119,20 +153,30 @@ heap_tracker_thaw(heap_tracker_t* heap_tracker) void memalloc_heap_tracker_init(uint32_t sample_size) { - heap_tracker_init(&global_heap_tracker); - global_heap_tracker.sample_size = sample_size; - global_heap_tracker.current_sample_size = heap_tracker_next_sample_size(sample_size); + + if (memlock_trylock(&g_memheap_lock)) { + heap_tracker_init(&global_heap_tracker); + global_heap_tracker.sample_size = sample_size; + global_heap_tracker.current_sample_size = heap_tracker_next_sample_size(sample_size); + memlock_unlock(&g_memheap_lock); + } } void memalloc_heap_tracker_deinit(void) { - heap_tracker_wipe(&global_heap_tracker); + if (memlock_trylock(&g_memheap_lock)) { + heap_tracker_wipe(&global_heap_tracker); + memlock_unlock(&g_memheap_lock); + } } void memalloc_heap_untrack(void* ptr) { + if (!memlock_trylock(&g_memheap_lock)) { + return; + } if (global_heap_tracker.frozen) { /* Check that we still have space to store 
the free. If we don't have enough space, we ignore the untrack. That's sad as there is a chance
@@ -144,6 +188,8 @@ memalloc_heap_untrack(void* ptr)
         ptr_array_append(&global_heap_tracker.freezer.frees, ptr);
     } else
         heap_tracker_untrack_thawed(&global_heap_tracker, ptr);
+
+    memlock_unlock(&g_memheap_lock);
 }

 /* Track a memory allocation in the heap profiler.
@@ -157,26 +203,36 @@ memalloc_heap_track(uint16_t max_nframe, void* ptr, size_t size, PyMemAllocatorD
         return false;

     /* Check for overflow */
-    global_heap_tracker.allocated_memory = Py_MIN(global_heap_tracker.allocated_memory + size, MAX_HEAP_SAMPLE_SIZE);
+    uint64_t res = atomic_add_clamped(&global_heap_tracker.allocated_memory, size, MAX_HEAP_SAMPLE_SIZE);
+    if (0 == res)
+        return false;
+
+    // Take the lock
+    if (!memlock_trylock(&g_memheap_lock)) {
+        return false;
+    }

     /* Check if we have enough sample or not */
-    if (global_heap_tracker.allocated_memory < global_heap_tracker.current_sample_size)
+    if (global_heap_tracker.allocated_memory < global_heap_tracker.current_sample_size) {
+        memlock_unlock(&g_memheap_lock);
         return false;
+    }

     /* Check if we can add more samples: the sum of the freezer + alloc tracker
      cannot be greater than what the alloc tracker can handle: when the alloc
      tracker is thawed, all the allocs in the freezer will be moved there!*/
-    if ((global_heap_tracker.freezer.allocs.count + global_heap_tracker.allocs.count) >= TRACEBACK_ARRAY_MAX_COUNT)
+    if (global_heap_tracker.freezer.allocs.count + global_heap_tracker.allocs.count >= TRACEBACK_ARRAY_MAX_COUNT) {
+        memlock_unlock(&g_memheap_lock);
         return false;
+    }

     /* Avoid loops */
-    if (memalloc_get_reentrant())
+    if (!memalloc_take_guard()) {
+        memlock_unlock(&g_memheap_lock);
         return false;
+    }

-    memalloc_set_reentrant(true);
     traceback_t* tb = memalloc_get_traceback(max_nframe, ptr, global_heap_tracker.allocated_memory, domain);
-    memalloc_set_reentrant(false);
-
     if (tb) {
         if (global_heap_tracker.frozen)
             traceback_array_append(&global_heap_tracker.freezer.allocs, tb);
@@ -189,15 +245,23 @@ memalloc_heap_track(uint16_t max_nframe, void* ptr, size_t size, PyMemAllocatorD

         /* Compute the new target sample size */
         global_heap_tracker.current_sample_size = heap_tracker_next_sample_size(global_heap_tracker.sample_size);

+        memalloc_yield_guard();
+        memlock_unlock(&g_memheap_lock);
         return true;
     }

+    memalloc_yield_guard();
+    memlock_unlock(&g_memheap_lock);
     return false;
 }

 PyObject*
 memalloc_heap()
 {
+    if (!memlock_trylock(&g_memheap_lock)) {
+        return NULL;
+    }
+
     heap_tracker_freeze(&global_heap_tracker);

     PyObject* heap_list = PyList_New(global_heap_tracker.allocs.count);
@@ -213,5 +277,6 @@ memalloc_heap()

     heap_tracker_thaw(&global_heap_tracker);

+    memlock_unlock(&g_memheap_lock);
     return heap_list;
 }

diff --git a/ddtrace/profiling/collector/_memalloc_reentrant.c b/ddtrace/profiling/collector/_memalloc_reentrant.c
new file mode 100644
index 00000000000..d360d19fb30
--- /dev/null
+++ b/ddtrace/profiling/collector/_memalloc_reentrant.c
@@ -0,0 +1,3 @@
+#include "_memalloc_reentrant.h"
+
+MEMALLOC_TLS bool _MEMALLOC_ON_THREAD = false;

diff --git a/ddtrace/profiling/collector/_memalloc_reentrant.h b/ddtrace/profiling/collector/_memalloc_reentrant.h
index 5c8a552294e..cb4aa246961 100644
--- a/ddtrace/profiling/collector/_memalloc_reentrant.h
+++ b/ddtrace/profiling/collector/_memalloc_reentrant.h
@@ -1,50 +1,188 @@
 #ifndef _DDTRACE_MEMALLOC_REENTRANT_H
 #define _DDTRACE_MEMALLOC_REENTRANT_H

-#include "_pymacro.h"
+#ifdef _WIN32
+#include <windows.h>
+#else
+#define _POSIX_C_SOURCE 200809L
+#include <errno.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#endif
 #include <stdbool.h>
+#include <stdatomic.h>
+#include <stdint.h>

-#ifndef _PY37_AND_LATER
-#include <pythread.h>
+// Cross-platform macro for defining thread-local storage
+// NB - we use dynamic-global on Linux because the others are problematic
+#if defined(_MSC_VER) // Check for MSVC compiler
+#define MEMALLOC_TLS __declspec(thread)
+#elif defined(__GNUC__) || defined(__clang__) // GCC or Clang
+#define MEMALLOC_TLS __attribute__((tls_model("global-dynamic"))) __thread
+#else
+#error "Unsupported compiler for thread-local storage"
 #endif
+extern MEMALLOC_TLS bool _MEMALLOC_ON_THREAD;
+
+// This is a saturating atomic add for 32- and 64-bit platforms.
+// In order to implement the saturation logic, use a CAS loop.
+// From the GCC docs:
+// "'__atomic' builtins can be used with any integral scalar or pointer type that is 1, 2, 4, or 8 bytes in length"
+// From the MSVC docs:
+// "_InterlockedCompareExchange64 is available on x86 systems running on any Pentium architecture; it is not
+// available on 386 or 486 architectures."
+static inline uint64_t
+atomic_add_clamped(uint64_t* target, uint64_t amount, uint64_t max)
+{
+    // In reality, there's virtually no scenario in which this deadlocks. Just the same, give it some arbitrarily high
+    // limit in order to prevent unpredicted deadlocks. 96 is chosen since it's the number of cores on the largest
+    // consumer CPU generally used by our customers.
+    int attempts = 96;
+    while (attempts--) {
+        uint64_t old_val = (volatile uint64_t)*target;

-#ifdef _PY37_AND_LATER
-extern Py_tss_t memalloc_reentrant_key;
+        // CAS loop + saturation check
+        uint64_t new_val = old_val + amount;
+        if (new_val > max || new_val < old_val) {
+            return 0;
+        }
+#if defined(_MSC_VER)
+        uint64_t prev_val =
+          (uint64_t)InterlockedCompareExchange64((volatile LONG64*)target, (LONG64)new_val, (LONG64)old_val);
+        if (prev_val == old_val) {
+            return new_val;
+        }
+#elif defined(__clang__) || defined(__GNUC__)
+        if (atomic_compare_exchange_strong_explicit(
+              (_Atomic uint64_t*)target, &old_val, new_val, memory_order_seq_cst, memory_order_seq_cst)) {
+            return new_val;
+        }
 #else
-extern int memalloc_reentrant_key;
+#error "Unsupported compiler for atomic operations"
 #endif
+        // If we reach here, the CAS failed; another thread changed `target`.
+        // Retry until success or until we hit the attempt limit.
+    }

+    return 0;
+}

-/* Any non-NULL pointer can be used */
-#define _MEMALLOC_REENTRANT_VALUE Py_True

-static inline void
-memalloc_set_reentrant(bool reentrant)
+// Opaque lock type
+typedef struct
+{
+#ifdef _WIN32
+    HANDLE mutex;
+#else
+    pthread_mutex_t mutex;
+#endif
+} memlock_t;
+
+// Global setting; if a lock fails to be acquired, crash
+static bool g_crash_on_mutex_pass = false;
+
+// Generic initializer
+static inline bool
+memlock_init(memlock_t* lock, bool crash_on_pass)
+{
+    if (!lock)
+        return false;
+
+    g_crash_on_mutex_pass = crash_on_pass;
+
+#ifdef _WIN32
+    lock->mutex = CreateMutex(NULL, FALSE, NULL);
+    return lock->mutex != NULL;
+#else
+    // For POSIX systems, we make sure to use an ERRORCHECK type mutex, since it pushes some of the state checking
+    // down to the implementation.
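+    // (An ERRORCHECK mutex returns EDEADLK when a thread relocks a mutex it already holds, and
+    // EPERM when unlocking a mutex it doesn't own, rather than deadlocking or invoking undefined
+    // behavior as a default mutex may.)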
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
+    return pthread_mutex_init(&lock->mutex, &attr) == 0; // NB: pass the configured attr, else ERRORCHECK is never applied
+#endif
+}
+
+// Unlock function
+static inline bool
+memlock_unlock(memlock_t* lock)
 {
-    if (reentrant)
-#ifdef _PY37_AND_LATER
-        PyThread_tss_set(&memalloc_reentrant_key, _MEMALLOC_REENTRANT_VALUE);
+    if (!lock)
+        return false;
+
+#ifdef _WIN32
+    return ReleaseMutex(lock->mutex);
 #else
-        PyThread_set_key_value(memalloc_reentrant_key, _MEMALLOC_REENTRANT_VALUE);
+    return pthread_mutex_unlock(&lock->mutex) == 0;
+#endif
+}
+
+// trylock function
+static inline bool
+memlock_trylock(memlock_t* lock)
+{
+    if (!lock)
+        return false;
+
+#ifdef __linux__
+    // On Linux, we need to make sure we didn't just fork
+    // pthreads will guarantee the lock is consistent, but we at least need to clear it
+    static pid_t my_pid = 0;
+    if (my_pid == 0) {
+        my_pid = getpid();
+    } else if (my_pid != getpid()) {
+        // We've forked, so we need to free the lock
+        memlock_unlock(lock);
+        my_pid = getpid();
+    }
 #endif
-    else
-#ifdef _PY37_AND_LATER
-        PyThread_tss_set(&memalloc_reentrant_key, NULL);
+
+#ifdef _WIN32
+    bool result = WAIT_OBJECT_0 == WaitForSingleObject(lock->mutex, 0); // 0ms timeout -> no wait
 #else
-        PyThread_set_key_value(memalloc_reentrant_key, NULL);
+    bool result = 0 == pthread_mutex_trylock(&lock->mutex);
 #endif
+    if (!result && g_crash_on_mutex_pass) {
+        // segfault
+        int* p = NULL;
+        *p = 0;
+        abort(); // should never reach here
+    }
+
+    return result;
 }

+// Cleanup function
 static inline bool
-memalloc_get_reentrant(void)
+memlock_destroy(memlock_t* lock)
 {
-#ifdef _PY37_AND_LATER
-    if (PyThread_tss_get(&memalloc_reentrant_key))
+    if (!lock)
+        return false;
+
+#ifdef _WIN32
+    return CloseHandle(lock->mutex);
 #else
-    if (PyThread_get_key_value(memalloc_reentrant_key))
+    return 0 == pthread_mutex_destroy(&lock->mutex);
 #endif
-        return true;
+}

-    return false;
+static inline bool
+memalloc_take_guard()
+{
+    // Ordinarily, a process-wide semaphore would require a CAS, but since this is thread-local we can just set it.
+    if (_MEMALLOC_ON_THREAD)
+        return false;
+    _MEMALLOC_ON_THREAD = true;
+    return true;
+}
+
+static inline void
+memalloc_yield_guard(void)
+{
+    // Ideally, we'd actually capture the old state within an object and restore it, but since this is
+    // a coarse-grained lock, we just set it to false.
+ _MEMALLOC_ON_THREAD = false; } #endif diff --git a/ddtrace/profiling/collector/_memalloc_tb.c b/ddtrace/profiling/collector/_memalloc_tb.c index ba79021f719..bb265fe08d5 100644 --- a/ddtrace/profiling/collector/_memalloc_tb.c +++ b/ddtrace/profiling/collector/_memalloc_tb.c @@ -87,6 +87,9 @@ memalloc_tb_deinit(void) void traceback_free(traceback_t* tb) { + if (!tb) + return; + for (uint16_t nframe = 0; nframe < tb->nframe; nframe++) { Py_DECREF(tb->frames[nframe].filename); Py_DECREF(tb->frames[nframe].name); @@ -197,11 +200,7 @@ memalloc_get_traceback(uint16_t max_nframe, void* ptr, size_t size, PyMemAllocat traceback->size = size; traceback->ptr = ptr; -#ifdef _PY37_AND_LATER traceback->thread_id = PyThread_get_thread_ident(); -#else - traceback->thread_id = tstate->thread_id; -#endif traceback->domain = domain; diff --git a/ddtrace/profiling/collector/_pymacro.h b/ddtrace/profiling/collector/_pymacro.h index e71ed6888b9..aa31c3d4cc1 100644 --- a/ddtrace/profiling/collector/_pymacro.h +++ b/ddtrace/profiling/collector/_pymacro.h @@ -13,8 +13,4 @@ #define _PY38 #endif -#if PY_VERSION_HEX >= 0x03070000 -#define _PY37_AND_LATER -#endif - #endif diff --git a/releasenotes/notes/fix-profiling-memalloc-segfault-5593ad951405a75d.yaml b/releasenotes/notes/fix-profiling-memalloc-segfault-5593ad951405a75d.yaml new file mode 100644 index 00000000000..8632b62af50 --- /dev/null +++ b/releasenotes/notes/fix-profiling-memalloc-segfault-5593ad951405a75d.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixes an issue where the memory allocation profiler can cause a segmentation fault due to + data races when accessing its own global data structures from multiple threads. diff --git a/setup.py b/setup.py index 13b0cb4a4f0..6ed826c5920 100644 --- a/setup.py +++ b/setup.py @@ -510,8 +510,11 @@ def get_exts_for(name): "ddtrace/profiling/collector/_memalloc.c", "ddtrace/profiling/collector/_memalloc_tb.c", "ddtrace/profiling/collector/_memalloc_heap.c", + "ddtrace/profiling/collector/_memalloc_reentrant.c", ], - extra_compile_args=debug_compile_args, + extra_compile_args=debug_compile_args + ["-D_POSIX_C_SOURCE=200809L", "-std=c11"] + if CURRENT_OS != "Windows" + else ["/std:c11"], ), Extension( "ddtrace.internal._threads", From 45c64c63ac6f62b5f2aca229d2602be3e41c9b3b Mon Sep 17 00:00:00 2001 From: David Sanchez <838104+sanchda@users.noreply.github.com> Date: Thu, 19 Dec 2024 18:00:29 -0500 Subject: [PATCH 2/4] fix(profiling): reset all profiling c++ mutexes on fork [backport 2.18] (#11768) (#11808) ## Checklist - [X] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [X] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to 
change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --------- Co-authored-by: Taegyun Kim --- .../profiling/dd_wrapper/include/uploader_builder.hpp | 2 ++ .../datadog/profiling/dd_wrapper/src/code_provenance.cpp | 3 +-- .../datadog/profiling/dd_wrapper/src/ddup_interface.cpp | 1 + .../internal/datadog/profiling/dd_wrapper/src/profile.cpp | 2 +- .../internal/datadog/profiling/dd_wrapper/src/uploader.cpp | 3 ++- .../datadog/profiling/dd_wrapper/src/uploader_builder.cpp | 7 +++++++ .../internal/datadog/profiling/stack_v2/src/sampler.cpp | 5 +++++ .../datadog/profiling/stack_v2/src/thread_span_links.cpp | 4 +--- .../fix-profiling-native-mutices-62440b5a3d9d6c4b.yaml | 5 +++++ 9 files changed, 25 insertions(+), 7 deletions(-) create mode 100644 releasenotes/notes/fix-profiling-native-mutices-62440b5a3d9d6c4b.yaml diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/include/uploader_builder.hpp b/ddtrace/internal/datadog/profiling/dd_wrapper/include/uploader_builder.hpp index 62ee6aad853..7077096c744 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/include/uploader_builder.hpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/include/uploader_builder.hpp @@ -43,6 +43,8 @@ class UploaderBuilder static void set_output_filename(std::string_view _output_filename); static std::variant build(); + + static void postfork_child(); }; } // namespace Datadog diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/code_provenance.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/code_provenance.cpp index 0a4a49a4ce5..f3147cd2034 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/code_provenance.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/code_provenance.cpp @@ -14,9 +14,8 @@ namespace Datadog { void Datadog::CodeProvenance::postfork_child() { - get_instance().mtx.~mutex(); // Destroy the mutex + // NB placement-new to re-init and leak the mutex because doing anything else is UB new (&get_instance().mtx) std::mutex(); // Recreate the mutex - get_instance().reset(); // Reset the state } void diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp index baee51a7eda..e34bb7e8695 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/ddup_interface.cpp @@ -24,6 +24,7 @@ ddup_postfork_child() Datadog::Uploader::postfork_child(); Datadog::SampleManager::postfork_child(); Datadog::CodeProvenance::postfork_child(); + Datadog::UploaderBuilder::postfork_child(); } void diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/profile.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/profile.cpp index f9f7a3e9585..7a6c69964ae 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/profile.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/profile.cpp @@ -186,6 +186,6 @@ Datadog::Profile::collect(const ddog_prof_Sample& sample, int64_t endtime_ns) void Datadog::Profile::postfork_child() { - profile_mtx.unlock(); + new (&profile_mtx) std::mutex(); cycle_buffers(); } diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader.cpp 
b/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader.cpp index 375c2e09e9e..325771946d8 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader.cpp @@ -175,5 +175,6 @@ Datadog::Uploader::postfork_parent() void Datadog::Uploader::postfork_child() { - unlock(); + // NB placement-new to re-init and leak the mutex because doing anything else is UB + new (&upload_lock) std::mutex(); } diff --git a/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader_builder.cpp b/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader_builder.cpp index 0661b7f217f..8ff5d45e7c2 100644 --- a/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader_builder.cpp +++ b/ddtrace/internal/datadog/profiling/dd_wrapper/src/uploader_builder.cpp @@ -186,3 +186,10 @@ Datadog::UploaderBuilder::build() return Datadog::Uploader{ output_filename, ddog_exporter }; } + +void +Datadog::UploaderBuilder::postfork_child() +{ + // NB placement-new to re-init and leak the mutex because doing anything else is UB + new (&tag_mutex) std::mutex(); +} diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp index c05ae45477e..7ad9ad692b2 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/sampler.cpp @@ -67,6 +67,11 @@ _stack_v2_atfork_child() // so we don't even reveal this function to the user _set_pid(getpid()); ThreadSpanLinks::postfork_child(); + + // `thread_info_map_lock` and `task_link_map_lock` are global locks held in echion + // NB placement-new to re-init and leak the mutex because doing anything else is UB + new (&thread_info_map_lock) std::mutex; + new (&task_link_map_lock) std::mutex; } __attribute__((constructor)) void diff --git a/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp b/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp index c777ff8a510..6be43a04a42 100644 --- a/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp +++ b/ddtrace/internal/datadog/profiling/stack_v2/src/thread_span_links.cpp @@ -53,10 +53,8 @@ ThreadSpanLinks::reset() void ThreadSpanLinks::postfork_child() { - // Explicitly destroy and reconstruct the mutex to avoid undefined behavior - get_instance().mtx.~mutex(); + // NB placement-new to re-init and leak the mutex because doing anything else is UB new (&get_instance().mtx) std::mutex(); - get_instance().reset(); } diff --git a/releasenotes/notes/fix-profiling-native-mutices-62440b5a3d9d6c4b.yaml b/releasenotes/notes/fix-profiling-native-mutices-62440b5a3d9d6c4b.yaml new file mode 100644 index 00000000000..40167a974c3 --- /dev/null +++ b/releasenotes/notes/fix-profiling-native-mutices-62440b5a3d9d6c4b.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + profiling: Fixes a bug where profiling mutexes were not cleared on fork in the child process. This could + cause deadlocks in certain configurations. From 0d6c79bbe374e780ec07663c0d55613e8db81d18 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 19 Dec 2024 23:41:04 +0000 Subject: [PATCH 3/4] fix(celery): stop closing prerun_span too soon to account for Celery chains scenario [backport 2.18] (#11806) Backport e8aab659df2df0769586856bdf9f3eaefcfbbb5b from #11498 to 2.18. 
We've made a few changes to handle Celery context recently, including https://github.com/DataDog/dd-trace-py/pull/10676

In particular, the goal of https://github.com/DataDog/dd-trace-py/pull/10676 was to handle a scenario where a long-running task may run into an exception, preventing it from closing. Unfortunately, that fix did not account for cases where tasks are chained and may not close until later. See: https://github.com/DataDog/dd-trace-py/issues/11479 and https://github.com/DataDog/dd-trace-py/issues/11624

With this PR, the sample app in https://github.com/DataDog/dd-trace-py/issues/11479 attaches the Celery-specific span back to the root span. I also need to add tests for the chains scenario.

Related to AIDM-494
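For quick context, the failing shape is a simple two-task chain; the reproducer added by this patch in `tests/contrib/celery/run_tasks.py` (shown in full in the diff below) boils down to:

```python
from tasks import fn_a
from tasks import fn_b

# In a chain, apply_async for the next task runs before the current task's
# postrun signal fires, so the old cleanup closed the worker span too early.
(fn_a.si() | fn_b.si()).delay()
```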
## Checklist
- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

Co-authored-by: wantsui
---
 ddtrace/contrib/internal/celery/app.py        | 11 ----
 ddtrace/contrib/internal/celery/signals.py    |  3 -
 ...-celery-closed-spans-34ff43868c1e33b8.yaml |  4 ++
 tests/contrib/celery/run_tasks.py             |  5 ++
 tests/contrib/celery/tasks.py                 | 14 +++++
 tests/contrib/celery/test_chained_task.py     | 62 +++++++++++++++++++
 6 files changed, 85 insertions(+), 14 deletions(-)
 create mode 100644 releasenotes/notes/fix-celery-closed-spans-34ff43868c1e33b8.yaml
 create mode 100644 tests/contrib/celery/run_tasks.py
 create mode 100644 tests/contrib/celery/tasks.py
 create mode 100644 tests/contrib/celery/test_chained_task.py

diff --git a/ddtrace/contrib/internal/celery/app.py b/ddtrace/contrib/internal/celery/app.py
index b61585097a7..42eed2cb468 100644
--- a/ddtrace/contrib/internal/celery/app.py
+++ b/ddtrace/contrib/internal/celery/app.py
@@ -133,10 +133,6 @@ def _traced_apply_async_inner(func, instance, args, kwargs):
                 if task_span:
                     task_span.set_exc_info(*sys.exc_info())

-                prerun_span = core.get_item("prerun_span")
-                if prerun_span:
-                    prerun_span.set_exc_info(*sys.exc_info())
-
                 raise
             finally:
                 task_span = core.get_item("task_span")
@@ -147,11 +143,4 @@ def _traced_apply_async_inner(func, instance, args, kwargs):
                     )
                     task_span.finish()

-                prerun_span = core.get_item("prerun_span")
-                if prerun_span:
-                    log.debug(
-                        "The task_postrun signal was not called, so manually closing span: %s",
-                        prerun_span._pprint()
-                    )
-                    prerun_span.finish()
-
         return _traced_apply_async_inner

diff --git a/ddtrace/contrib/internal/celery/signals.py b/ddtrace/contrib/internal/celery/signals.py
index 76f07ee7524..8f27fcc53b0 100644
--- a/ddtrace/contrib/internal/celery/signals.py
+++ b/ddtrace/contrib/internal/celery/signals.py
@@ -54,9 +54,6 @@ def trace_prerun(*args, **kwargs):
         service = config.celery["worker_service_name"]
         span = pin.tracer.trace(c.WORKER_ROOT_SPAN, service=service, resource=task.name, span_type=SpanTypes.WORKER)

-        # Store an item called "prerun span" in case task_postrun doesn't get called
-        core.set_item("prerun_span", span)
-
         # set span.kind to the type of request being performed
         span.set_tag_str(SPAN_KIND, SpanKind.CONSUMER)

diff --git a/releasenotes/notes/fix-celery-closed-spans-34ff43868c1e33b8.yaml b/releasenotes/notes/fix-celery-closed-spans-34ff43868c1e33b8.yaml
new file mode 100644
index 00000000000..f16f7b36fed
--- /dev/null
+++ b/releasenotes/notes/fix-celery-closed-spans-34ff43868c1e33b8.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    tracing(celery): Fixes an issue where ``celery.apply`` spans from Celery prerun were closed too soon, leading to missing span tags.
\ No newline at end of file

diff --git a/tests/contrib/celery/run_tasks.py b/tests/contrib/celery/run_tasks.py
new file mode 100644
index 00000000000..e91454ab5bb
--- /dev/null
+++ b/tests/contrib/celery/run_tasks.py
@@ -0,0 +1,5 @@
+from tasks import fn_a
+from tasks import fn_b
+
+
+(fn_a.si() | fn_b.si()).delay()

diff --git a/tests/contrib/celery/tasks.py b/tests/contrib/celery/tasks.py
new file mode 100644
index 00000000000..a9dfc936ae4
--- /dev/null
+++ b/tests/contrib/celery/tasks.py
@@ -0,0 +1,14 @@
+from celery import Celery
+
+
+app = Celery("tasks")
+
+
+@app.task(name="tests.contrib.celery.tasks.fn_a")
+def fn_a():
+    return "a"
+
+
+@app.task(name="tests.contrib.celery.tasks.fn_b")
+def fn_b():
+    return "b"

diff --git a/tests/contrib/celery/test_chained_task.py b/tests/contrib/celery/test_chained_task.py
new file mode 100644
index 00000000000..5fd0c543e72
--- /dev/null
+++ b/tests/contrib/celery/test_chained_task.py
@@ -0,0 +1,62 @@
+import os
+import re
+import subprocess
+import time
+
+from celery import Celery
+
+
+# Ensure that when we call Celery chains, the root span has Celery-specific span tags.
+# The test_integration.py setup doesn't perfectly mimic the conditions of a worker process running.
+# This test runs the worker as a side process so we can check the tracer logs afterwards to ensure expected span results.
+# See https://github.com/DataDog/dd-trace-py/issues/11479
+def test_task_chain_task_call_task():
+    app = Celery("tasks")
+
+    celery_worker_cmd = "ddtrace-run celery -A tasks worker -c 1 -l DEBUG -n uniquename1 -P solo"
+    celery_task_runner_cmd = "ddtrace-run python run_tasks.py"
+
+    # The commands need to run from the directory where this test file lives
+    current_directory = str(os.path.dirname(__file__))
+
+    worker_process = subprocess.Popen(
+        celery_worker_cmd.split(),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        preexec_fn=os.setsid,
+        close_fds=True,
+        cwd=current_directory,
+    )
+
+    max_wait_time = 10
+    waited_so_far = 0
+    # app.control.inspect().active() returns {'celery@uniquename1': []} when the worker is running
+    while app.control.inspect().active() is None and waited_so_far < max_wait_time:
+        time.sleep(1)
+        waited_so_far += 1
+
+    # The task should only run after the Celery worker has had sufficient time to start up
+    task_runner_process = subprocess.Popen(
+        celery_task_runner_cmd.split(),
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        preexec_fn=os.setsid,
+        close_fds=True,
+        cwd=current_directory,
+    )
+
+    task_runner_process.wait()
+    # Kill the worker process so it starts to send traces to the Trace Agent
+    worker_process.kill()
+    worker_logs = worker_process.stderr.read()
+
+    # Check that the root span was created with one of the Celery-specific tags, such as celery.correlation_id
+    # Some versions of Python seem to require escaping when using `re.search`:
+    old_pattern_match = r"resource=\\'tests.contrib.celery.tasks.fn_a\\' type=\\'worker\\' .* tags=.*correlation_id.*"
+    new_pattern_match = r"resource=\'tests.contrib.celery.tasks.fn_a\' type=\'worker\' .* tags=.*correlation_id.*"
+
+    pattern_exists = (
+        re.search(old_pattern_match, str(worker_logs)) is not None
+        or re.search(new_pattern_match, str(worker_logs)) is not None
+    )
+    assert pattern_exists

From c0bb6bd287cdfca5f92c981749aada7d7fe355ac Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 31 Dec 2024 11:30:50 +0100
Subject: [PATCH 4/4] fix: error parsing response cookies in FastAPI and awsgi [backport 2.18] (#11837)

Backport e29ccb0a68bc73bf3d7f071e66e1c8a20eb0c801 from #11829 to 2.18.
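To see the bug this fixes at a glance (a sketch mirroring the parametrized test cases added below, not code from the patch):

```python
# Old parsing: unpacking assumes an "=" is always present in the header value
cookie_key, cookie_value = "test_cookie".split("=", maxsplit=1)  # ValueError: not enough values to unpack

# New helper: check the length first, and keep any extra "=" in the value intact
result = "cookie2=value1=value2".split("=", maxsplit=1)
if len(result) == 2:
    cookie_key, cookie_value = result  # ('cookie2', 'value1=value2')
```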
This fix resolves an issue parsing response cookies in FastAPI and awsgi.

Issue: https://github.com/DataDog/dd-trace-py/issues/11818

## Checklist
- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

Co-authored-by: Alberto Vara
---
 ddtrace/contrib/internal/asgi/middleware.py   | 22 +++++----
 .../iast-fix-awsgi-368c173e1f012400.yaml      |  4 ++
 tests/contrib/asgi/test_asgi.py               | 48 +++++++++++++++++++
 3 files changed, 65 insertions(+), 9 deletions(-)
 create mode 100644 releasenotes/notes/iast-fix-awsgi-368c173e1f012400.yaml

diff --git a/ddtrace/contrib/internal/asgi/middleware.py b/ddtrace/contrib/internal/asgi/middleware.py
index e32d2994a3d..98d352cf75f 100644
--- a/ddtrace/contrib/internal/asgi/middleware.py
+++ b/ddtrace/contrib/internal/asgi/middleware.py
@@ -91,6 +91,18 @@ async def _blocked_asgi_app(scope, receive, send):
     await send({"type": "http.response.body", "body": b""})


+def _parse_response_cookies(response_headers):
+    cookies = {}
+    try:
+        result = response_headers.get("set-cookie", "").split("=", maxsplit=1)
+        if len(result) == 2:
+            cookie_key, cookie_value = result
+            cookies[cookie_key] = cookie_value
+    except Exception:
+        log.debug("failed to extract response cookies", exc_info=True)
+    return cookies
+
+
 class TraceMiddleware:
     """
     ASGI application middleware that traces the requests.
@@ -211,7 +223,6 @@ async def __call__(self, scope, receive, send): peer_ip = client[0] else: peer_ip = None - trace_utils.set_http_meta( span, self.integration_config, @@ -234,15 +245,8 @@ async def wrapped_send(message): except Exception: log.warning("failed to extract response headers", exc_info=True) response_headers = None - if span and message.get("type") == "http.response.start" and "status" in message: - cookies = {} - try: - cookie_key, cookie_value = response_headers.get("set-cookie", "").split("=", maxsplit=1) - cookies[cookie_key] = cookie_value - except Exception: - log.debug("failed to extract response cookies", exc_info=True) - + cookies = _parse_response_cookies(response_headers) status_code = message["status"] trace_utils.set_http_meta( span, diff --git a/releasenotes/notes/iast-fix-awsgi-368c173e1f012400.yaml b/releasenotes/notes/iast-fix-awsgi-368c173e1f012400.yaml new file mode 100644 index 00000000000..4d40945744a --- /dev/null +++ b/releasenotes/notes/iast-fix-awsgi-368c173e1f012400.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + ASGI: This fix resolves an issue parsing response cookies in FastAPI and awsgi diff --git a/tests/contrib/asgi/test_asgi.py b/tests/contrib/asgi/test_asgi.py index 40935990fc2..c1e1f1c7328 100644 --- a/tests/contrib/asgi/test_asgi.py +++ b/tests/contrib/asgi/test_asgi.py @@ -1,5 +1,6 @@ import asyncio from functools import partial +import logging import os import random @@ -10,6 +11,7 @@ from ddtrace.constants import ERROR_MSG from ddtrace.contrib.asgi import TraceMiddleware from ddtrace.contrib.asgi import span_from_scope +from ddtrace.contrib.internal.asgi.middleware import _parse_response_cookies from ddtrace.propagation import http as http_propagation from tests.conftest import DEFAULT_DDTRACE_SUBPROCESS_TEST_SERVICE_NAME from tests.utils import DummyTracer @@ -634,6 +636,52 @@ async def test_tasks_asgi_without_more_body(scope, tracer, test_spans): assert request_span.duration < 1 +@pytest.mark.asyncio +async def test_request_parse_response_cookies(tracer, test_spans, caplog): + """ + Regression test https://github.com/DataDog/dd-trace-py/issues/11818 + """ + + async def tasks_cookies(scope, receive, send): + message = await receive() + if message.get("type") == "http.request": + await send({"type": "http.response.start", "status": 200, "headers": [[b"set-cookie", b"test_cookie"]]}) + await send({"type": "http.response.body", "body": b"*"}) + await asyncio.sleep(1) + + with caplog.at_level(logging.DEBUG): + app = TraceMiddleware(tasks_cookies, tracer=tracer) + async with httpx.AsyncClient(app=app) as client: + response = await client.get("http://testserver/") + assert response.status_code == 200 + + assert "failed to extract response cookies" not in caplog.text + + +@pytest.mark.parametrize( + "headers,expected_result", + [ + ({}, {}), + ({"cookie": "cookie1=value1"}, {}), + ({"header-1": ""}, {}), + ({"Set-cookie": "cookie1=value1"}, {}), + ({"set-Cookie": "cookie1=value1"}, {}), + ({"SET-cookie": "cookie1=value1"}, {}), + ({"set-cookie": "a"}, {}), + ({"set-cookie": "1234"}, {}), + ({"set-cookie": "cookie1=value1"}, {"cookie1": "value1"}), + ({"set-cookie": "cookie2=value1=value2"}, {"cookie2": "value1=value2"}), + ({"set-cookie": "cookie3=="}, {"cookie3": "="}), + ], +) +def test__parse_response_cookies(headers, expected_result, caplog): + with caplog.at_level(logging.DEBUG): + result = _parse_response_cookies(headers) + + assert "failed to extract response cookies" not in caplog.text + assert result == expected_result + + 
@pytest.mark.asyncio async def test_tasks_asgi_with_more_body(scope, tracer, test_spans): """