Getting Structured Data from ChatGPT

Using Pydantic for OpenAI function calling
OpenAI
Published

February 16, 2024

Keywords

function calling, ChatGPT, Pydantic

This notebook uses pydantic and the ChatGPT API’s function calling to extract details about a protest event from a newspaper article. In the old days, you had to ask the model in the prompt to return a JSON-like object. Later, I wrote the JSON schemas by hand in the function definitions. Now I’m learning to use pydantic to generate those schemas from Python classes.

pip install openai pydantic -q
Note: you may need to restart the kernel to use updated packages.
from datetime import date
from enum import Enum
import json

from pydantic import BaseModel, Field
from typing import Optional, List
from openai import OpenAI
import pandas as pd
# Days of the week, in Monday-to-Sunday order. Every member's value is the
# same string as its name (e.g. WeekDay.Monday.value == "Monday").
WeekDay = Enum(
    "WeekDay",
    {
        day: day
        for day in (
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
            "Sunday",
        )
    },
)


# Two-letter abbreviations accepted for a protest location: the 50 U.S. states
# plus DC, in alphabetical order. Every member's value equals its name
# (e.g. StateAB.OH.value == "OH").
StateAB = Enum(
    "StateAB",
    [
        (code, code)
        for code in (
            "AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS "
            "KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV "
            "NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY"
        ).split()
    ],
)


# Ordinal crowd-size buckets; the integer values are what appears in the
# generated schema and in the extracted data.
SizeCategory = Enum(
    "SizeCategory",
    [
        ("UNKNOWN", 0),  # size unknown
        ("SMALL", 1),  # 1-99
        ("MEDIUM", 2),  # 100-999
        ("LARGE", 3),  # 1,000-9,999
        ("VERY_LARGE", 4),  # 10,000+
    ],
)


class SizeDetails(BaseModel):
    # Crowd-size details for one protest. The description strings end up in
    # the generated JSON schema, so they are kept verbatim.

    # Required: phrases describing how many people took part.
    size_text: List[str] = Field(
        default=...,
        description="List of text descriptors for the number of people who participated in the event.",
    )

    # Optional: exact head count; None when no figure is reported.
    size_exact: Optional[int] = Field(
        default=None,
        description="Exact number of participants, if reported.",
    )

    # Required: a single best-guess number of participants.
    size_estimate: int = Field(
        default=...,
        description="Your best guess at the estimated number of participants based on the entire article.",
    )

    # Bucketed size; falls back to UNKNOWN when not supplied.
    size_cat: SizeCategory = Field(
        default=SizeCategory.UNKNOWN,
        description="Categorical indicator of crowd size. 0 = unknown; 1 = 1-99; 2 = 100-999; 3 = 1,000-9,999; 4 = 10,000+.",
    )


class LocationDetails(BaseModel):
    # Where the protest happened. The description strings end up in the
    # generated JSON schema, so they are kept verbatim.

    # Required: city name.
    city: str = Field(
        default=...,
        description="The city where the protest took place.",
    )

    # Required: must be one of the StateAB members.
    state_abbreviation: StateAB = Field(
        default=...,
        description="The two-letter abbreviation of the state where the protest took place, such as NY or CA.",
    )

    # Optional: None when no neighborhood applies.
    neighborhood: Optional[str] = Field(
        default=None,
        description="The neighborhood where the protest took place, if applicable.",
    )

    # Required: whether the protest changed location.
    moved: bool = Field(
        default=...,
        description="Indicates whether the protest moved from one location to another.",
    )


class DateDetails(BaseModel):
    # When the protest happened. The description strings end up in the
    # generated JSON schema, so they are kept verbatim.

    # Required: calendar date of the protest.
    event_date: date = Field(
        default=...,
        description="Date of the protest. Pay attention to dates mentioned in the article and words such as ‘yesterday,’ ‘last week,’ and ‘Monday.’",
    )

    # Required: must be one of the WeekDay members.
    day_of_week: WeekDay = Field(
        default=...,
        description="The day of the week the protest occurred, such as Monday or Thursday.",
    )

    # Required: the date-related phrases themselves.
    date_text: List[str] = Field(
        default=...,
        description="List of text descriptors for the protest date, such as 'yesterday', 'last week', or 'Monday' .",
    )


class ParticipantDetails(BaseModel):
    # Who took part in the protest. Every field is optional and defaults to an
    # empty list. The description strings end up in the generated JSON schema,
    # so they are kept verbatim (including their spacing).

    # Organizations that took part in the event.
    organizations: List[str] = Field(
        [],
        description="Names of organizations that participated in the protest event. Exclude targets or other organizations mentioned but not protesting.  Organizational participation can take many forms, from organizing and leading the event to sponsoring or co-sponsoring it to providing one or more speakers for it to just showing up to the event as a recognizable presence.  ",
    )

    # Individuals named as organizers.
    advocates: List[str] = Field(
        [],
        description="The names of individuals who organized.",
    )

    # Free-text descriptors of the kinds of people who participated.
    participant_type: List[str] = Field(
        [],
        description="Descriptors of participants in the event, such as students, nurses, or local residents.  Record words or phrases describing the participants in the event. The goal is to capture as much information as possible about the kinds of people who participated, as distinct from any organizations they represent or belong to. ",
    )


class Protest(BaseModel):
    # Top-level record for one article; its JSON schema is what gets passed as
    # the function parameters in get_protest_details. The description strings
    # end up in that schema, so they are kept verbatim.

    # Optional flag, False unless supplied.
    protest_article: bool = Field(
        default=False,
        description="Indicates if the article describes a protest against police brutality.",
    )

    # Required: short summary text.
    summary: str = Field(
        default=...,
        description="A focused summary of the article focusing on the protest details.",
    )

    # Required nested records (location, size, participants, date).
    location: LocationDetails = Field(
        default=...,
        description="Location of the protest.",
    )
    size: SizeDetails = Field(
        default=...,
        description="Size of the protest.",
    )
    participants: ParticipantDetails = Field(
        default=...,
        description="Organizations and participants in the protest.",
    )
    event_date: DateDetails = Field(
        default=...,
        description="Date of the protest. Pay attention to dates mentioned in the article and words such as ‘yesterday,’ ‘last week,’ and ‘Monday.’",
    )
# Sample input for get_protest_details: one news article stored as a dict with
# its full text, headline, publication date and source.
article = {
    "text": """COLUMBUS, Ohio (WCMH) – Yesterday was a national day of protest, and Columbus recognized the day when dozens of families gathered at the Ohio Statehouse to protest police brutality.

Protesters were asking for accountability and justice by sharing how they lost their loved ones, while organizers said the protest was about telling their stories in more than one way.

“We know that more than 1,200 Ohioans have been lost to police violence since the year 2000,” Ohio Families United for Political Action and Change (OFUPAC) Organizing Director Elaine Schleiffer said. “We wanted to represent the loss that that is, the empty shoes, that there’s no replacing those family members.”

OFUPAC is a non-profit organization that unites families who have lost loved ones in officer-involved shootings.

For many of those who turned out to yesterday’s protest, the issue hits close to home. Sabrina Jordan lost her son in an officer-involved shooting in 2017 just outside of Dayton.

“We’re just here also, to, like, celebrate and love each other,” Jordan, who is also OFUPAC’s founder, said. “You know, connect.”

Tania Hudson’s son was fatally shot by a Columbus police officer in 2015. 

“We’re asking accountability,” she said. “Officers be drug tested when they’re involved in a shooting, alcohol test. We understand that they have trauma and drama, too.”

The city’s police union, the Fraternal Order of Police (FOP), said there is already accountability in place.

“Accountability? How much more accountability can they ask for,” FOP Executive Vice President Brian Steel said. “We have an internal affairs. We have an inspector general’s office. We’re investigated by BCI in, say, a police-involved shooting, in a grand jury of our peers. There’s literally no more accountability that can be put on police officers today.”

“Accountability is pretty much all that we can ask for,” Hudson said. “We can’t say justice – ours is gone. There will never be justice for us, but we’re out here trying to save other people’s lives. That’s why we’re constantly out here.”

Protestors also mentioned their frustration with Marsy’s Law, which was originally passed to protect the victims of violent crimes, but which was extended to allow law enforcement departments to shield officers’ names when they are involved in a shooting. Protesters think this shouldn’t be the case while Steel said it’s an important protection for officers who are victims of violent crimes.
""",
    "headline": "Statehouse protest calls for end to police brutality",
    "publication-date": "2023-10-22",
    "source": "WCMH",
}
def get_protest_details(article):
    """Extract structured protest details from an article via the ChatGPT API.

    The article is sent to the chat completions endpoint together with a
    single ``protest_details`` function whose parameters are the JSON schema
    of the ``Protest`` model. The model is required to call that function,
    and the arguments it supplies are decoded and returned.

    Args:
        article: The article to analyse. It is interpolated into the prompt
            with an f-string, so its string form is what the model sees (the
            notebook passes a dict with text, headline, date and source).

    Returns:
        The JSON-decoded arguments of the function call in the first choice
        (expected to be a dict shaped like the ``Protest`` schema; the result
        is not validated against the model here).

    Raises:
        ValueError: If the response contains no function call. Malformed
            JSON in the arguments surfaces as ``json.JSONDecodeError``, which
            is a ``ValueError`` subclass.
    """
    client = OpenAI(
        max_retries=3,
        timeout=20.0,
    )

    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant that extracts summaries of newspaper articles about political protests as JSON for a database. ",
        },
        {
            "role": "user",
            "content": f"""Extract information about the details about a protest from the following article.
      Only use information from the article.

      {article}
      
      """,
        },
    ]

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",  # alternative: 'gpt-4-turbo-preview'
        functions=[
            {
                "name": "protest_details",
                "description": "Extract insights from media article about protest.",
                "parameters": Protest.model_json_schema(),
            }
        ],
        # Require the model to call the function; otherwise it may answer in
        # plain text and there would be no arguments to parse.
        function_call={"name": "protest_details"},
        n=1,
        messages=messages,
    )

    function_call = completion.choices[0].message.function_call
    if function_call is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("The model response did not contain a function call.")

    return json.loads(function_call.arguments)
# Run the extraction on the sample article.
r = get_protest_details(article)
# Flatten the result into a one-row DataFrame for display.
df = pd.json_normalize(
    r, sep="_"
)  # The result contains nested dictionaries, so json_normalize flattens them into columns, joining nested keys with "_"
df
summary location_city location_state_abbreviation location_neighborhood location_moved size_size_text size_size_exact size_size_estimate size_size_cat participants_organizations participants_advocates participants_participant_type event_date_event_date event_date_day_of_week event_date_date_text
0 Yesterday was a national day of protest in Col... Columbus OH None False [dozens, families] None 50 1 [Ohio Families United for Political Action and... [] [] 2023-10-21 Saturday [yesterday]

Estimated cost:

# Show the JSON schema pydantic generates for Protest (the same value that is passed as the function's "parameters").
Protest.model_json_schema()
{'$defs': {'DateDetails': {'properties': {'event_date': {'description': 'Date of the protest. Pay attention to dates mentioned in the article and words such as ‘yesterday,’ ‘last week,’ and ‘Monday.’',
     'format': 'date',
     'title': 'Event Date',
     'type': 'string'},
    'day_of_week': {'allOf': [{'$ref': '#/$defs/WeekDay'}],
     'description': 'The day of the week the protest occurred, such as Monday or Thursday.'},
    'date_text': {'description': "List of text descriptors for the protest date, such as 'yesterday', 'last week', or 'Monday' .",
     'items': {'type': 'string'},
     'title': 'Date Text',
     'type': 'array'}},
   'required': ['event_date', 'day_of_week', 'date_text'],
   'title': 'DateDetails',
   'type': 'object'},
  'LocationDetails': {'properties': {'city': {'description': 'The city where the protest took place.',
     'title': 'City',
     'type': 'string'},
    'state_abbreviation': {'allOf': [{'$ref': '#/$defs/StateAB'}],
     'description': 'The two-letter abbreviation of the state where the protest took place, such as NY or CA.'},
    'neighborhood': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
     'default': None,
     'description': 'The neighborhood where the protest took place, if applicable.',
     'title': 'Neighborhood'},
    'moved': {'description': 'Indicates whether the protest moved from one location to another.',
     'title': 'Moved',
     'type': 'boolean'}},
   'required': ['city', 'state_abbreviation', 'moved'],
   'title': 'LocationDetails',
   'type': 'object'},
  'ParticipantDetails': {'properties': {'organizations': {'default': [],
     'description': 'Names of organizations that participated in the protest event. Exclude targets or other organizations mentioned but not protesting.  Organizational participation can take many forms, from organizing and leading the event to sponsoring or co-sponsoring it to providing one or more speakers for it to just showing up to the event as a recognizable presence.  ',
     'items': {'type': 'string'},
     'title': 'Organizations',
     'type': 'array'},
    'advocates': {'default': [],
     'description': 'The names of individuals who organized.',
     'items': {'type': 'string'},
     'title': 'Advocates',
     'type': 'array'},
    'participant_type': {'default': [],
     'description': 'Descriptors of participants in the event, such as students, nurses, or local residents.  Record words or phrases describing the participants in the event. The goal is to capture as much information as possible about the kinds of people who participated, as distinct from any organizations they represent or belong to. ',
     'items': {'type': 'string'},
     'title': 'Participant Type',
     'type': 'array'}},
   'title': 'ParticipantDetails',
   'type': 'object'},
  'SizeCategory': {'enum': [0, 1, 2, 3, 4],
   'title': 'SizeCategory',
   'type': 'integer'},
  'SizeDetails': {'properties': {'size_text': {'description': 'List of text descriptors for the number of people who participated in the event.',
     'items': {'type': 'string'},
     'title': 'Size Text',
     'type': 'array'},
    'size_exact': {'anyOf': [{'type': 'integer'}, {'type': 'null'}],
     'default': None,
     'description': 'Exact number of participants, if reported.',
     'title': 'Size Exact'},
    'size_estimate': {'description': 'Your best guess at the estimated number of participants based on the entire article.',
     'title': 'Size Estimate',
     'type': 'integer'},
    'size_cat': {'allOf': [{'$ref': '#/$defs/SizeCategory'}],
     'default': 0,
     'description': 'Categorical indicator of crowd size. 0 = unknown; 1 = 1-99; 2 = 100-999; 3 = 1,000-9,999; 4 = 10,000+.'}},
   'required': ['size_text', 'size_estimate'],
   'title': 'SizeDetails',
   'type': 'object'},
  'StateAB': {'enum': ['AK',
    'AL',
    'AR',
    'AZ',
    'CA',
    'CO',
    'CT',
    'DC',
    'DE',
    'FL',
    'GA',
    'HI',
    'IA',
    'ID',
    'IL',
    'IN',
    'KS',
    'KY',
    'LA',
    'MA',
    'MD',
    'ME',
    'MI',
    'MN',
    'MO',
    'MS',
    'MT',
    'NC',
    'ND',
    'NE',
    'NH',
    'NJ',
    'NM',
    'NV',
    'NY',
    'OH',
    'OK',
    'OR',
    'PA',
    'RI',
    'SC',
    'SD',
    'TN',
    'TX',
    'UT',
    'VA',
    'VT',
    'WA',
    'WI',
    'WV',
    'WY'],
   'title': 'StateAB',
   'type': 'string'},
  'WeekDay': {'enum': ['Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday'],
   'title': 'WeekDay',
   'type': 'string'}},
 'properties': {'protest_article': {'default': False,
   'description': 'Indicates if the article describes a protest against police brutality.',
   'title': 'Protest Article',
   'type': 'boolean'},
  'summary': {'description': 'A focused summary of the article focusing on the protest details.',
   'title': 'Summary',
   'type': 'string'},
  'location': {'allOf': [{'$ref': '#/$defs/LocationDetails'}],
   'description': 'Location of the protest.'},
  'size': {'allOf': [{'$ref': '#/$defs/SizeDetails'}],
   'description': 'Size of the protest.'},
  'participants': {'allOf': [{'$ref': '#/$defs/ParticipantDetails'}],
   'description': 'Organizations and participants in the protest.'},
  'event_date': {'allOf': [{'$ref': '#/$defs/DateDetails'}],
   'description': 'Date of the protest. Pay attention to dates mentioned in the article and words such as ‘yesterday,’ ‘last week,’ and ‘Monday.’'}},
 'required': ['summary', 'location', 'size', 'participants', 'event_date'],
 'title': 'Protest',
 'type': 'object'}