commit d2c8d5dca2 (parent 6663e8a366)

    cartman

16 changed files with 219159 additions and 0 deletions
82	api/main.py (Executable file)
@@ -0,0 +1,82 @@
from fastapi import FastAPI, Request
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware

from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.models.auto.modeling_auto import AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/DialoGPT-large", padding_side='left')
model = AutoModelForCausalLM.from_pretrained(
    "../train/cartman/models/output-medium")


class Packet(BaseModel):
    message: str
    max_new_tokens: int
    num_beams: int
    num_beam_groups: int
    no_repeat_ngram_size: int
    length_penalty: float
    diversity_penalty: float
    repetition_penalty: float
    early_stopping: bool


def cartman_respond(packet: Packet) -> str:
    input_ids = tokenizer(packet.message +
                          tokenizer.eos_token, return_tensors="pt").input_ids

    outputs = model.generate(
        input_ids,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=packet.max_new_tokens,
        num_beams=packet.num_beams,
        num_beam_groups=packet.num_beam_groups,
        no_repeat_ngram_size=packet.no_repeat_ngram_size,
        length_penalty=packet.length_penalty,
        diversity_penalty=packet.diversity_penalty,
        repetition_penalty=packet.repetition_penalty,
        early_stopping=packet.early_stopping,

        # do_sample = True,
        # top_k = 100,
        # top_p = 0.7,
        # temperature = 0.8,
    )
    return tokenizer.decode(outputs[:, input_ids.shape[-1]:][0],
                            skip_special_tokens=True)


api = FastAPI()

api.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@api.post('/chat/')
async def getInformation(request: Request) -> dict[str, str]:
    data = await request.json()

    packet = Packet(
        message=data.get('message'),
        max_new_tokens=data.get('max_new_tokens'),
        num_beams=data.get('num_beams'),
        num_beam_groups=data.get('num_beam_groups'),
        no_repeat_ngram_size=data.get('no_repeat_ngram_size'),
        length_penalty=data.get('length_penalty'),
        diversity_penalty=data.get('diversity_penalty'),
        repetition_penalty=data.get('repetition_penalty'),
        early_stopping=data.get('early_stopping'),
    )

    print(packet.message)
    response = cartman_respond(packet)
    print(response)

    return {"Cartman": response}
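Side note on the handler above (a minimal sketch, not part of the commit): since Packet is a Pydantic model, FastAPI can parse and validate the JSON body itself when the endpoint parameter is annotated with it, which would replace the manual request.json() / data.get(...) unpacking and return a 422 on malformed input:

    # Hypothetical slimmer handler, assuming the api, Packet and
    # cartman_respond definitions from api/main.py above.
    @api.post('/chat/')
    async def chat(packet: Packet) -> dict[str, str]:
        return {"Cartman": cartman_respond(packet)}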
31	api/requirements.txt (Normal file)
@@ -0,0 +1,31 @@
anyio==3.6.2
certifi==2022.12.7
charset-normalizer==3.0.1
click==8.1.3
fastapi==0.89.1
filelock==3.9.0
h11==0.14.0
huggingface-hub==0.12.0
idna==3.4
numpy==1.24.2
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
packaging==23.0
Pillow==9.4.0
pydantic==1.10.4
PyYAML==6.0
regex==2022.10.31
requests==2.28.2
sniffio==1.3.0
starlette==0.22.0
tokenizers==0.13.2
torch==1.13.1
torchaudio==0.13.1
torchvision==0.14.1
tqdm==4.64.1
transformers==4.26.0
typing_extensions==4.4.0
urllib3==1.26.14
uvicorn==0.20.0
3	api/run (Executable file)
@@ -0,0 +1,3 @@
#!/bin/bash

uvicorn main:api --host 10.0.1.1 --reload
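Note that --host 10.0.1.1 binds uvicorn to one specific LAN address; the test client below posts to 127.0.0.1:8000, so running the client on the same machine as the server presumably requires launching with --host 127.0.0.1 (or 0.0.0.0) instead. Port 8000 is uvicorn's default.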
24	api/test/test.py (Normal file)
@@ -0,0 +1,24 @@
import requests
import json

while True:
    user_input: str = input('>> ')
    if user_input in ('q', 'x'):
        break
    else:
        packet = {
            'message': user_input,
            'max_new_tokens': 20,
            'num_beams': 2,
            'num_beam_groups': 2,
            'no_repeat_ngram_size': 3,
            'length_penalty': 1.4,
            'diversity_penalty': 0.1,
            'repetition_penalty': 2.1,
            'early_stopping': True,
        }

        response = requests.post(
            'http://127.0.0.1:8000/chat/', json=packet)

        print(response.json())
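The parameter set above exercises diverse (group) beam search on the server side: in transformers, diversity_penalty only takes effect when num_beam_groups > 1, and num_beams must be divisible by num_beam_groups, which num_beams=2 with num_beam_groups=2 satisfies.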
3	train/.gitignore (vendored, Normal file)
@@ -0,0 +1,3 @@
__pycache__/
.ipynb_checkpoints/
cartman/
939	train/clean.ipynb (Normal file)
@@ -0,0 +1,939 @@
Jupyter notebook (Python 3 ipykernel, 3.10.8). The JSON source is rendered here as its code cells, with the bulky HTML/plain-text table outputs condensed to summaries:

In [1]:
    import pandas as pd
    df = pd.read_csv('data/All-seasons.csv')
    print(df.shape)
Out: (70896, 4)

In [2]:
    df.Season.value_counts()
Out: lines per season -- 2: 6416, 3: 5798, 4: 5680, 6: 5131, 5: 4414, 7: 4236,
     1: 4170, 8: 3601, 9: 3526, 11: 3478, 10: 3471, 14: 3346, 12: 3307,
     13: 3257, 16: 3120, 15: 3101, 18: 2522, 17: 2305, plus 17 rows whose
     Season field is the literal string "Season" (apparently embedded repeats
     of the CSV header row).

In [3]:
    df[df.Character.str.contains('artman')].Character.value_counts()
Out: 'Cartman' accounts for 9774 lines; the remaining ~40 labels are rare
     variants and multi-speaker credits (e.g. 'Evil Cartman' 23, 'New Cartman'
     18, 'Stan, Kyle, Cartman' 12, 'Kyle, Cartman' 7, 'Mrs. Cartman' 3, ...
     mostly 1-2 lines each).

In [4]:
    df.head(25)
Out: columns Season, Episode, Character, Line; the first rows are from season
     10, episode 1, and every Line ends with a literal '\n'.

In [5]:
    cleanlines = pd.Series([cell.replace('\n','').strip() for cell in df.Line])

In [6]:
    pd.DataFrame(cleanlines).head(25)
Out: the same 25 lines with trailing newlines and surrounding whitespace removed.

In [7]:
    print(df.shape[0] - cleanlines.shape[0])
Out: 0   (cleaning dropped no rows)

In [8]:
    train = pd.DataFrame(df.Character)
    train['line'] = cleanlines
    train.columns = ['name','line']

In [9]:
    train.head(25)
Out: two-column frame (name, line) holding the cleaned text.

In [10]:
    train.to_csv('data/train.csv',index=False)
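One thing the notebook surfaces but does not act on: the 17 rows whose Season field is the literal string "Season" (apparently repeated CSV header rows) are carried through into train.csv. A hypothetical filtering step, not part of this commit, could drop them before writing:

    import pandas as pd

    df = pd.read_csv('data/All-seasons.csv')
    # Coerce Season to numbers; embedded header rows become NaN and are dropped.
    season_num = pd.to_numeric(df.Season, errors='coerce')
    df = df[season_num.notna()].reset_index(drop=True)
    print(df.shape)  # expect 17 fewer rows than (70896, 4)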
143681	train/data/All-seasons.csv (Normal file; diff suppressed because it is too large)
70897	train/data/train.csv (Normal file; diff suppressed because it is too large)
26	train/test/beam.py (Normal file)
@@ -0,0 +1,26 @@
from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
model = AutoModelForCausalLM.from_pretrained('../output-medium')

# chatting 5 times with beam search
for step in range(5):
    # take user input
    text = input(">> You: ")
    # encode the input and add end of string token
    input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
    # concatenate new user input with chat history (if there is)
    bot_input_ids = torch.cat([chat_history_ids, input_ids], dim=-1) if step > 0 else input_ids
    # generate a bot response
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=1000,
        num_beams=3,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id
    )
    # print the output
    output = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print(f"Cartman: {output}")
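A caveat shared by this script and the four variants that follow: max_length=1000 caps the total sequence (accumulated chat history plus the new reply), unlike the max_new_tokens used in api/main.py, which caps only the generated continuation. Because chat_history_ids keeps growing each turn, a long enough session will hit the cap and generation will stop adding tokens.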
25	train/test/greedy.py (Normal file)
@@ -0,0 +1,25 @@
from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-large')
model = AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-large')
# model = AutoModelForCausalLM.from_pretrained('../output-medium')

# chatting 5 times with greedy search
for step in range(5):
    # take user input
    text = input(">> You: ")
    # encode the input and add end of string token
    input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
    # concatenate new user input with chat history (if there is)
    bot_input_ids = torch.cat([chat_history_ids, input_ids], dim=-1) if step > 0 else input_ids
    # generate a bot response
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
    )
    # print the output
    output = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print(f"Bot: {output}")
34	train/test/nucleus.py (Normal file)
@@ -0,0 +1,34 @@
from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
model = AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-medium')
# model = AutoModelForCausalLM.from_pretrained('../output-medium')

# chatting 5 times with nucleus & top-k sampling & tweaking temperature & multiple
# sentences
for step in range(5):
    # take user input
    text = input(">> You: ")
    # encode the input and add end of string token
    input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
    # concatenate new user input with chat history (if there is)
    bot_input_ids = torch.cat([chat_history_ids, input_ids], dim=-1) if step > 0 else input_ids
    # generate a bot response
    chat_history_ids_list = model.generate(
        bot_input_ids,
        max_length=1000,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.75,
        num_return_sequences=5,
        pad_token_id=tokenizer.eos_token_id
    )
    # print the outputs
    for i in range(len(chat_history_ids_list)):
        output = tokenizer.decode(chat_history_ids_list[i][bot_input_ids.shape[-1]:], skip_special_tokens=True)
        print(f"Cartman {i}: {output}")
    choice_index = int(input("Choose the response you want for the next input: "))
    chat_history_ids = torch.unsqueeze(chat_history_ids_list[choice_index], dim=0)
26	train/test/sample.py (Normal file)
@@ -0,0 +1,26 @@
from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
model = AutoModelForCausalLM.from_pretrained('../output-medium')

# chatting 5 times with sampling
for step in range(5):
    # take user input
    text = input(">> You: ")
    # encode the input and add end of string token
    input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
    # concatenate new user input with chat history (if there is)
    bot_input_ids = torch.cat([chat_history_ids, input_ids], dim=-1) if step > 0 else input_ids
    # generate a bot response
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=1000,
        do_sample=True,
        top_k=0,
        pad_token_id=tokenizer.eos_token_id
    )
    # print the output
    output = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print(f"Cartman: {output}")
27	train/test/sample_topk.py (Normal file)
@@ -0,0 +1,27 @@
from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
model = AutoModelForCausalLM.from_pretrained('../output-medium')

# chatting 5 times with Top K sampling & tweaking temperature
for step in range(5):
    # take user input
    text = input(">> You: ")
    # encode the input and add end of string token
    input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
    # concatenate new user input with chat history (if there is)
    bot_input_ids = torch.cat([chat_history_ids, input_ids], dim=-1) if step > 0 else input_ids
    # generate a bot response
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=1000,
        do_sample=True,
        top_k=100,
        temperature=0.75,
        pad_token_id=tokenizer.eos_token_id
    )
    # print the output
    output = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print(f"Cartman: {output}")
29	train/test/top_p.py (Normal file)
@@ -0,0 +1,29 @@
from transformers.models.auto.tokenization_auto import AutoTokenizer
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-large')
model = AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-large')
# model = AutoModelForCausalLM.from_pretrained('../output-medium')

# chatting 10 times with nucleus sampling & tweaking temperature
for step in range(10):
    # take user input
    text = input(">> You: ")
    # encode the input and add end of string token
    input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
    # concatenate new user input with chat history (if there is)
    bot_input_ids = torch.cat([chat_history_ids, input_ids], dim=-1) if step > 0 else input_ids
    # generate a bot response
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=1000,
        do_sample=True,
        top_p=0.95,
        top_k=0,
        temperature=0.75,
        pad_token_id=tokenizer.eos_token_id
    )
    # print the output
    output = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print(f"Cartman: {output}")
2761	train/train.ipynb (Normal file; diff suppressed because it is too large)
571	train/train.py (Normal file)
@@ -0,0 +1,571 @@
# all the imports

import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

# --------------------------------------------------------------------------

data = pd.read_csv('data/train.csv')

CHARACTER_NAME = 'TARGET'
contexted = []

# context window of size 7
n = 7

for i in data[data.name == CHARACTER_NAME].index:
    if i < n:
        continue
    row = []
    prev = i - 1 - n  # we additionally subtract 1, so row will contain the current response and the 7 previous responses
    for j in range(i, prev, -1):
        row.append(data.line[j])
    contexted.append(row)

columns = ['response', 'context']
columns = columns + ['context/' + str(i) for i in range(n - 1)]

df = pd.DataFrame.from_records(contexted, columns=columns)

trn_df, val_df = train_test_split(df, test_size=0.1)
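To make the windowing concrete: for each line spoken by CHARACTER_NAME (a placeholder here; presumably set to e.g. 'Cartman' when this was actually run), the inner loop walks backwards from index i, so each record holds the response followed by its n = 7 predecessors, most recent first. A toy illustration with stand-in data:

    # Hypothetical stand-in for data.line, showing the shape of one record.
    lines = [f"line{k}" for k in range(10)]
    i, n = 9, 7
    row = [lines[j] for j in range(i, i - 1 - n, -1)]
    print(row)
    # ['line9', 'line8', 'line7', 'line6', 'line5', 'line4', 'line3', 'line2']
    # row[0] is the response; row[1:] are the 7 preceding lines,
    # matching columns ['response', 'context', 'context/0', ..., 'context/5'].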
# create dataset suitable for our model
def construct_conv(row, tokenizer, eos=True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
    conv = flatten(conv)
    return conv


class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)
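Reading construct_conv: the reversal puts the oldest context line first, and each turn is encoded and terminated with the EOS token, so one training example is tok(context/5) + EOS + ... + tok(context) + EOS + tok(response) + EOS, i.e. the flat, EOS-separated turn format DialoGPT was trained on.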
# Caching and storing of data/checkpoints

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)
from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")

"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""

# Configs
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

# Args to allow for easy conversion of python script to notebook
class Args():
    def __init__(self):
        self.output_dir = 'models/output-medium'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-large'
        self.config_name = 'microsoft/DialoGPT-large'
        self.tokenizer_name = 'microsoft/DialoGPT-large'
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 4
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

args = Args()
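A naming quirk worth flagging: the base model here is DialoGPT-large, yet checkpoints land in models/output-medium; the 'medium' in the directory name presumably predates a switch of base model. The paths also suggest training was run from inside the git-ignored train/cartman/ directory, since api/main.py loads the model from ../train/cartman/models/output-medium.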
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
|
||||||
|
""" Train the model """
|
||||||
|
if args.local_rank in [-1, 0]:
|
||||||
|
tb_writer = SummaryWriter()
|
||||||
|
|
||||||
|
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
|
||||||
|
|
||||||
|
def collate(examples: List[torch.Tensor]):
|
||||||
|
if tokenizer._pad_token is None:
|
||||||
|
return pad_sequence(examples, batch_first=True)
|
||||||
|
return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
|
||||||
|
|
||||||
|
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
|
||||||
|
train_dataloader = DataLoader(
|
||||||
|
train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
|
||||||
|
)
|
||||||
|
|
||||||
|
if args.max_steps > 0:
|
||||||
|
t_total = args.max_steps
|
||||||
|
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
|
||||||
|
else:
|
||||||
|
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||||
|
|
||||||
|
model = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training
|
||||||
|
model.resize_token_embeddings(len(tokenizer))
|
||||||
|
# add_special_tokens_(model, tokenizer)
|
||||||
|
|
||||||
|
|
||||||
|
# Prepare optimizer and schedule (linear warmup and decay)
|
||||||
|
no_decay = ["bias", "LayerNorm.weight"]
|
||||||
|
optimizer_grouped_parameters = [
|
||||||
|
{
|
||||||
|
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||||
|
"weight_decay": args.weight_decay,
|
||||||
|
},
|
||||||
|
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||||
|
]
|
||||||
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||||
|
scheduler = get_linear_schedule_with_warmup(
|
||||||
|
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||||
|
)
|
||||||
|
|
||||||
|
    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info(" Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            # For causal LM fine-tuning the labels are the inputs themselves;
            # the model shifts them internally when computing the loss.
            inputs, labels = (batch, batch)
            # GPT-2's context window is 1024 tokens; skip longer examples.
            if inputs.shape[1] > 1024:
                continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
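# Note on the accumulation arithmetic above: the loss is divided by
# gradient_accumulation_steps so that the summed .backward() calls produce an
# averaged gradient, and optimizer.step() only fires every
# gradient_accumulation_steps batches. The effective batch size is therefore
# train_batch_size * gradient_accumulation_steps (times the world size when
# distributed), e.g. a per-GPU batch of 2 with 4 accumulation steps trains as
# if the batch size were 8.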
# Evaluation of the fine-tuned model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last=True
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(eval_dataset))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result
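# Perplexity is just the exponentiated mean token-level cross-entropy, so it can
# be sanity-checked by hand: an eval_loss of 3.0 gives
# torch.exp(torch.tensor(3.0)) -> tensor(20.0855), i.e. the model is about as
# uncertain as a uniform choice over ~20 tokens at each position.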
# Main runner

def main(df_trn, df_val):
    args = Args()

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForCausalLM.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results


main(trn_df, val_df)
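# Once main() returns, the fine-tuned weights live in args.output_dir and can be
# reloaded for inference. A minimal sketch, assuming the same output directory
# and an eos-terminated prompt:
#   tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
#   model = AutoModelForCausalLM.from_pretrained(args.output_dir)
#   ids = tokenizer("Hi there!" + tokenizer.eos_token, return_tensors="pt").input_ids
#   reply = model.generate(ids, max_new_tokens=40, pad_token_id=tokenizer.eos_token_id)
#   print(tokenizer.decode(reply[:, ids.shape[-1]:][0], skip_special_tokens=True))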