|
79 | 79 | "<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/ch06_compressed/chapter-overview.webp\" width=500px>"
|
80 | 80 | ]
|
81 | 81 | },
|
| 82 | + { |
| 83 | + "cell_type": "code", |
| 84 | + "execution_count": 2, |
| 85 | + "id": "946c3e56-b04b-4b0f-b35f-b485ce5b28df", |
| 86 | + "metadata": {}, |
| 87 | + "outputs": [], |
| 88 | + "source": [ |
| 89 | + "# Utility to prevent certain cells from being executed twice\n", |
| 90 | + "\n", |
| 91 | + "from IPython.core.magic import register_line_cell_magic\n", |
| 92 | + "\n", |
| 93 | + "executed_cells = set()  # names of cells that have already run successfully\n", |
| 94 | + "\n", |
| 95 | + "@register_line_cell_magic\n", |
| 96 | + "def run_once(line, cell=None):\n", |
| 97 | + "    \"\"\"Run `cell` once per name `line`; `cell` is None for the `%run_once` line form.\"\"\"\n", |
| 98 | + "    if line in executed_cells:\n", |
| 99 | + "        print(f\"Cell '{line}' has already been executed.\")\n", |
| 100 | + "    elif cell is not None and get_ipython().run_cell(cell).success:\n", |
| 101 | + "        executed_cells.add(line)  # recorded only after a clean run, so a failed cell can be retried" |
| 102 | + ] |
| 103 | + }, |
82 | 104 | {
|
83 | 105 | "cell_type": "markdown",
|
84 | 106 | "id": "3a84cf35-b37f-4c15-8972-dfafc9fadc1c",
|
|
167 | 189 | },
|
168 | 190 | {
|
169 | 191 | "cell_type": "code",
|
170 |
| - "execution_count": 2, |
| 192 | + "execution_count": 3, |
171 | 193 | "id": "def7c09b-af9c-4216-90ce-5e67aed1065c",
|
172 | 194 | "metadata": {
|
173 | 195 | "colab": {
|
|
181 | 203 | "name": "stdout",
|
182 | 204 | "output_type": "stream",
|
183 | 205 | "text": [
|
184 |
| - "File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv\n" |
| 206 | + "sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction.\n" |
185 | 207 | ]
|
186 | 208 | }
|
187 | 209 | ],
|
|
230 | 252 | },
|
231 | 253 | {
|
232 | 254 | "cell_type": "code",
|
233 |
| - "execution_count": 3, |
| 255 | + "execution_count": 4, |
234 | 256 | "id": "da0ed4da-ac31-4e4d-8bdd-2153be4656a4",
|
235 | 257 | "metadata": {
|
236 | 258 | "colab": {
|
|
344 | 366 | "[5572 rows x 2 columns]"
|
345 | 367 | ]
|
346 | 368 | },
|
347 |
| - "execution_count": 3, |
| 369 | + "execution_count": 4, |
348 | 370 | "metadata": {},
|
349 | 371 | "output_type": "execute_result"
|
350 | 372 | }
|
|
368 | 390 | },
|
369 | 391 | {
|
370 | 392 | "cell_type": "code",
|
371 |
| - "execution_count": 4, |
| 393 | + "execution_count": 5, |
372 | 394 | "id": "495a5280-9d7c-41d4-9719-64ab99056d4c",
|
373 | 395 | "metadata": {
|
374 | 396 | "colab": {
|
|
406 | 428 | },
|
407 | 429 | {
|
408 | 430 | "cell_type": "code",
|
409 |
| - "execution_count": 5, |
| 431 | + "execution_count": 6, |
410 | 432 | "id": "7be4a0a2-9704-4a96-b38f-240339818688",
|
411 | 433 | "metadata": {
|
412 | 434 | "colab": {
|
|
428 | 450 | }
|
429 | 451 | ],
|
430 | 452 | "source": [
|
| 453 | + "%%run_once balance_df\n", |
| 454 | + "\n", |
| 455 | + "\n", |
431 | 456 | "def create_balanced_dataset(df):\n",
|
432 | 457 | " \n",
|
433 | 458 | " # Count the instances of \"spam\"\n",
|
|
441 | 466 | "\n",
|
442 | 467 | " return balanced_df\n",
|
443 | 468 | "\n",
|
| 469 | + "\n", |
444 | 470 | "balanced_df = create_balanced_dataset(df)\n",
|
445 | 471 | "print(balanced_df[\"Label\"].value_counts())"
|
446 | 472 | ]
|
|
457 | 483 | },
|
458 | 484 | {
|
459 | 485 | "cell_type": "code",
|
460 |
| - "execution_count": 6, |
| 486 | + "execution_count": 7, |
461 | 487 | "id": "c1b10c3d-5d57-42d0-8de8-cf80a06f5ffd",
|
462 | 488 | "metadata": {
|
463 | 489 | "id": "c1b10c3d-5d57-42d0-8de8-cf80a06f5ffd"
|
464 | 490 | },
|
465 | 491 | "outputs": [],
|
466 | 492 | "source": [
|
467 |
| - "balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})" |
| 493 | + "%%run_once label_mapping\n", |
| 494 | + "balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})  # run once: re-mapping the 0/1 labels would yield NaN" |
| 495 | + ] |
| 496 | + }, |
| 497 | + { |
| 498 | + "cell_type": "code", |
| 499 | + "execution_count": 8, |
| 500 | + "id": "e6f7f062-ef4e-4020-8275-71990cab4414", |
| 501 | + "metadata": {}, |
| 502 | + "outputs": [ |
| 503 | + { |
| 504 | + "data": { |
| 505 | + "text/html": [ |
| 506 | + "<div>\n", |
| 507 | + "<style scoped>\n", |
| 508 | + " .dataframe tbody tr th:only-of-type {\n", |
| 509 | + " vertical-align: middle;\n", |
| 510 | + " }\n", |
| 511 | + "\n", |
| 512 | + " .dataframe tbody tr th {\n", |
| 513 | + " vertical-align: top;\n", |
| 514 | + " }\n", |
| 515 | + "\n", |
| 516 | + " .dataframe thead th {\n", |
| 517 | + " text-align: right;\n", |
| 518 | + " }\n", |
| 519 | + "</style>\n", |
| 520 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 521 | + " <thead>\n", |
| 522 | + " <tr style=\"text-align: right;\">\n", |
| 523 | + " <th></th>\n", |
| 524 | + " <th>Label</th>\n", |
| 525 | + " <th>Text</th>\n", |
| 526 | + " </tr>\n", |
| 527 | + " </thead>\n", |
| 528 | + " <tbody>\n", |
| 529 | + " <tr>\n", |
| 530 | + " <th>4307</th>\n", |
| 531 | + " <td>0</td>\n", |
| 532 | + " <td>Awww dat is sweet! We can think of something t...</td>\n", |
| 533 | + " </tr>\n", |
| 534 | + " <tr>\n", |
| 535 | + " <th>4138</th>\n", |
| 536 | + " <td>0</td>\n", |
| 537 | + " <td>Just got to &lt;#&gt;</td>\n", |
| 538 | + " </tr>\n", |
| 539 | + " <tr>\n", |
| 540 | + " <th>4831</th>\n", |
| 541 | + " <td>0</td>\n", |
| 542 | + " <td>The word \"Checkmate\" in chess comes from the P...</td>\n", |
| 543 | + " </tr>\n", |
| 544 | + " <tr>\n", |
| 545 | + " <th>4461</th>\n", |
| 546 | + " <td>0</td>\n", |
| 547 | + " <td>This is wishing you a great day. Moji told me ...</td>\n", |
| 548 | + " </tr>\n", |
| 549 | + " <tr>\n", |
| 550 | + " <th>5440</th>\n", |
| 551 | + " <td>0</td>\n", |
| 552 | + " <td>Thank you. do you generally date the brothas?</td>\n", |
| 553 | + " </tr>\n", |
| 554 | + " <tr>\n", |
| 555 | + " <th>...</th>\n", |
| 556 | + " <td>...</td>\n", |
| 557 | + " <td>...</td>\n", |
| 558 | + " </tr>\n", |
| 559 | + " <tr>\n", |
| 560 | + " <th>5537</th>\n", |
| 561 | + " <td>1</td>\n", |
| 562 | + " <td>Want explicit SEX in 30 secs? Ring 02073162414...</td>\n", |
| 563 | + " </tr>\n", |
| 564 | + " <tr>\n", |
| 565 | + " <th>5540</th>\n", |
| 566 | + " <td>1</td>\n", |
| 567 | + " <td>ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...</td>\n", |
| 568 | + " </tr>\n", |
| 569 | + " <tr>\n", |
| 570 | + " <th>5547</th>\n", |
| 571 | + " <td>1</td>\n", |
| 572 | + " <td>Had your contract mobile 11 Mnths? Latest Moto...</td>\n", |
| 573 | + " </tr>\n", |
| 574 | + " <tr>\n", |
| 575 | + " <th>5566</th>\n", |
| 576 | + " <td>1</td>\n", |
| 577 | + " <td>REMINDER FROM O2: To get 2.50 pounds free call...</td>\n", |
| 578 | + " </tr>\n", |
| 579 | + " <tr>\n", |
| 580 | + " <th>5567</th>\n", |
| 581 | + " <td>1</td>\n", |
| 582 | + " <td>This is the 2nd time we have tried 2 contact u...</td>\n", |
| 583 | + " </tr>\n", |
| 584 | + " </tbody>\n", |
| 585 | + "</table>\n", |
| 586 | + "<p>1494 rows × 2 columns</p>\n", |
| 587 | + "</div>" |
| 588 | + ], |
| 589 | + "text/plain": [ |
| 590 | + " Label Text\n", |
| 591 | + "4307 0 Awww dat is sweet! We can think of something t...\n", |
| 592 | + "4138 0 Just got to <#>\n", |
| 593 | + "4831 0 The word \"Checkmate\" in chess comes from the P...\n", |
| 594 | + "4461 0 This is wishing you a great day. Moji told me ...\n", |
| 595 | + "5440 0 Thank you. do you generally date the brothas?\n", |
| 596 | + "... ... ...\n", |
| 597 | + "5537 1 Want explicit SEX in 30 secs? Ring 02073162414...\n", |
| 598 | + "5540 1 ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...\n", |
| 599 | + "5547 1 Had your contract mobile 11 Mnths? Latest Moto...\n", |
| 600 | + "5566 1 REMINDER FROM O2: To get 2.50 pounds free call...\n", |
| 601 | + "5567 1 This is the 2nd time we have tried 2 contact u...\n", |
| 602 | + "\n", |
| 603 | + "[1494 rows x 2 columns]" |
| 604 | + ] |
| 605 | + }, |
| 606 | + "execution_count": 8, |
| 607 | + "metadata": {}, |
| 608 | + "output_type": "execute_result" |
| 609 | + } |
| 610 | + ], |
| 611 | + "source": [ |
| 612 | + "balanced_df" |
468 | 613 | ]
|
469 | 614 | },
|
470 | 615 | {
|
|
0 commit comments