|
366 | 366 | "predicted_embeddings.head() " |
367 | 367 | ] |
368 | 368 | }, |
369 | | - { |
370 | | - "cell_type": "code", |
371 | | - "execution_count": null, |
372 | | - "metadata": { |
373 | | - "id": "4H_etYfsEOFP" |
374 | | - }, |
375 | | - "outputs": [], |
376 | | - "source": [ |
377 | | - "# Join the complaints with their embeddings in the same DataFrame\n", |
378 | | - "combined_df = downsampled_issues_df.join(predicted_embeddings)" |
379 | | - ] |
380 | | - }, |
381 | 369 | { |
382 | 370 | "attachments": {}, |
383 | 371 | "cell_type": "markdown", |
|
426 | 414 | "outputs": [], |
427 | 415 | "source": [ |
428 | 416 | "# Use KMeans clustering to calculate our groups. Will take ~3 minutes.\n", |
429 | | - "cluster_model.fit(combined_df[[\"text_embedding\"]])\n", |
430 | | - "clustered_result = cluster_model.predict(combined_df[[\"text_embedding\"]])\n", |
| 417 | + "cluster_model.fit(predicted_embeddings[[\"text_embedding\"]])\n", |
| 418 | + "clustered_result = cluster_model.predict(predicted_embeddings)\n", |
431 | 419 | "# Notice the CENTROID_ID column, which is the ID number of the group that\n", |
432 | 420 | "# each complaint belongs to.\n", |
433 | 421 | "clustered_result.head(n=5)" |
434 | 422 | ] |
435 | 423 | }, |
436 | | - { |
437 | | - "cell_type": "code", |
438 | | - "execution_count": null, |
439 | | - "metadata": {}, |
440 | | - "outputs": [], |
441 | | - "source": [ |
442 | | - "# Join the group number to the complaints and their text embeddings\n", |
443 | | - "combined_clustered_result = combined_df.join(clustered_result)\n", |
444 | | - "combined_clustered_result.head(n=5) " |
445 | | - ] |
446 | | - }, |
447 | 424 | { |
448 | 425 | "attachments": {}, |
449 | 426 | "cell_type": "markdown", |
450 | 427 | "metadata": {}, |
451 | 428 | "source": [ |
452 | | - "Our dataframe combined_clustered_result now has three columns: the complaints, their text embeddings, and an ID from 1-10 (inclusive) indicating which semantically similar group they belong to." |
| 429 | + "Our dataframe clustered_result now has three columns for each complaint: its content, its text embedding, and an ID from 1-10 (inclusive) indicating which semantically similar group it belongs to." |
453 | 430 | ] |
454 | 431 | }, |
455 | 432 | { |
|
480 | 457 | "source": [ |
481 | 458 | "# Using bigframes, with syntax identical to pandas,\n", |
482 | 459 | "# select the first and second groups\n", |
483 | | - "cluster_1_result = combined_clustered_result[\n", |
484 | | - " combined_clustered_result[\"CENTROID_ID\"] == 1\n", |
485 | | - "][[\"consumer_complaint_narrative\"]]\n", |
| 460 | + "cluster_1_result = clustered_result[\n", |
| 461 | + " clustered_result[\"CENTROID_ID\"] == 1\n", |
| 462 | + "][[\"content\"]]\n", |
486 | 463 | "cluster_1_result_pandas = cluster_1_result.head(5).to_pandas()\n", |
487 | 464 | "\n", |
488 | | - "cluster_2_result = combined_clustered_result[\n", |
489 | | - " combined_clustered_result[\"CENTROID_ID\"] == 2\n", |
490 | | - "][[\"consumer_complaint_narrative\"]]\n", |
| 465 | + "cluster_2_result = clustered_result[\n", |
| 466 | + " clustered_result[\"CENTROID_ID\"] == 2\n", |
| 467 | + "][[\"content\"]]\n", |
491 | 468 | "cluster_2_result_pandas = cluster_2_result.head(5).to_pandas()" |
492 | 469 | ] |
493 | 470 | }, |
|
503 | 480 | "prompt1 = 'comment list 1:\\n'\n", |
504 | 481 | "for i in range(5):\n", |
505 | 482 | " prompt1 += str(i + 1) + '. ' + \\\n", |
506 | | - " cluster_1_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n", |
| 483 | + " cluster_1_result_pandas[\"content\"].iloc[i] + '\\n'\n", |
507 | 484 | "\n", |
508 | 485 | "prompt2 = 'comment list 2:\\n'\n", |
509 | 486 | "for i in range(5):\n", |
510 | 487 | " prompt2 += str(i + 1) + '. ' + \\\n", |
511 | | - " cluster_2_result_pandas[\"consumer_complaint_narrative\"].iloc[i] + '\\n'\n", |
| 488 | + " cluster_2_result_pandas[\"content\"].iloc[i] + '\\n'\n", |
512 | 489 | "\n", |
513 | 490 | "print(prompt1)\n", |
514 | | - "print(prompt2)\n" |
| 491 | + "print(prompt2)" |
515 | 492 | ] |
516 | 493 | }, |
517 | 494 | { |
|
0 commit comments