Reformat code examples on llm university text-representation section (#355)

Co-authored-by: Max Shkutnyk <max@lightsonsoftware.com>
Co-authored-by: trentfowlercohere <141260477+trentfowlercohere@users.noreply.github.com>
cohere-ai · Jan 16, 2025 · 464caed · 464caed
1 parent 7af122a
commit 464caed
Show file tree

Hide file tree

Showing 13 changed files with 199 additions and 145 deletions.
diff --git a/...es/llm-university/intro-text-representation/classification-using-embeddings.mdx b/...es/llm-university/intro-text-representation/classification-using-embeddings.mdx
@@ -41,8 +41,8 @@ from sklearn.preprocessing import StandardScaler
 svm_classifier = make_pipeline(StandardScaler(), SVC())
 
 # Prepare the training features and label
-features = df_train['query_embeds'].tolist()
-label = df_train['intent']
+features = df_train["query_embeds"].tolist()
+label = df_train["intent"]
 
 # Fit the support vector machine
 svm_classifier.fit(features, label)
@@ -55,13 +55,13 @@ Once that is done, we’ll take the embeddings of the 9 data points, put them th
 
 # Prepare the test inputs
 df_test = df_test.copy()
-inputs = df_test['query_embeds'].tolist()
+inputs = df_test["query_embeds"].tolist()
 
 # Predict the labels
-df_test['intent_pred'] = svm_classifier.predict(inputs)
+df_test["intent_pred"] = svm_classifier.predict(inputs)
 
 # Compute the score
-score = svm_classifier.score(inputs, df_test['intent'])
+score = svm_classifier.score(inputs, df_test["intent"])
 print(f"Prediction accuracy is {100*score}%")
 ```
 

diff --git a/fern/pages/llm-university/intro-text-representation/classify-endpoint.mdx b/fern/pages/llm-university/intro-text-representation/classify-endpoint.mdx
@@ -44,40 +44,48 @@ The examples:
 from cohere import ClassifyExample
 
 examples = [
-  ClassifyExample(text="I'm so proud of you", label="positive"), 
-  ClassifyExample(text="What a great time to be alive", label="positive"), 
-  ClassifyExample(text="That's awesome work", label="positive"), 
-  ClassifyExample(text="The service was amazing", label="positive"), 
-  ClassifyExample(text="I love my family", label="positive"), 
-  ClassifyExample(text="They don't care about me", label="negative"), 
-  ClassifyExample(text="I hate this place", label="negative"), 
-  ClassifyExample(text="The most ridiculous thing I've ever heard", label="negative"), 
-  ClassifyExample(text="I am really frustrated", label="negative"), 
-  ClassifyExample(text="This is so unfair", label="negative"),
-  ClassifyExample(text="This made me think", label="neutral"), 
-  ClassifyExample(text="The good old days", label="neutral"), 
-  ClassifyExample(text="What's the difference", label="neutral"), 
-  ClassifyExample(text="You can't ignore this", label="neutral"), 
-  ClassifyExample(text="That's how I see it", label="neutral")            
+    ClassifyExample(text="I'm so proud of you", label="positive"),
+    ClassifyExample(
+        text="What a great time to be alive", label="positive"
+    ),
+    ClassifyExample(text="That's awesome work", label="positive"),
+    ClassifyExample(text="The service was amazing", label="positive"),
+    ClassifyExample(text="I love my family", label="positive"),
+    ClassifyExample(
+        text="They don't care about me", label="negative"
+    ),
+    ClassifyExample(text="I hate this place", label="negative"),
+    ClassifyExample(
+        text="The most ridiculous thing I've ever heard",
+        label="negative",
+    ),
+    ClassifyExample(text="I am really frustrated", label="negative"),
+    ClassifyExample(text="This is so unfair", label="negative"),
+    ClassifyExample(text="This made me think", label="neutral"),
+    ClassifyExample(text="The good old days", label="neutral"),
+    ClassifyExample(text="What's the difference", label="neutral"),
+    ClassifyExample(text="You can't ignore this", label="neutral"),
+    ClassifyExample(text="That's how I see it", label="neutral"),
 ]
 ```
 
 The inputs (we have twelve in this example):
 
 ```python PYTHON
-inputs=["Hello, world! What a beautiful day",
-        "It was a great time with great people",
-        "Great place to work",
-        "That was a wonderful evening",
-        "Maybe this is why",
-        "Let's start again",
-        "That's how I see it",
-        "These are all facts",
-        "This is the worst thing",
-        "I cannot stand this any longer",
-        "This is really annoying",
-        "I am just plain fed up"
-        ]
+inputs = [
+    "Hello, world! What a beautiful day",
+    "It was a great time with great people",
+    "Great place to work",
+    "That was a wonderful evening",
+    "Maybe this is why",
+    "Let's start again",
+    "That's how I see it",
+    "These are all facts",
+    "This is the worst thing",
+    "I cannot stand this any longer",
+    "This is really annoying",
+    "I am just plain fed up",
+]
 ```
 
 #### Get output
@@ -89,18 +97,19 @@ Putting everything together with the Classify endpoint looks like the following:
 ```python PYTHON
 co = cohere.Client(api_key)
 
+
 def classify_text(inputs, examples):
 
-  response = co.classify(
-    model='embed-english-v3.0',
-    inputs=inputs,
-    examples=examples)
-
-  classifications = response.classifications
-
-  return classifications
+    response = co.classify(
+        model="embed-english-v3.0", inputs=inputs, examples=examples
+    )
+
+    classifications = response.classifications
+
+    return classifications
+
 
-predictions = classify_text(inputs,examples)
+predictions = classify_text(inputs, examples)
 ```
 
 Together with the predicted class, the endpoint also returns the confidence value of the prediction (between 0 and 1). These confidence values are split among the classes (three in this case) and add up to a total of 1. The classifier then selects the class with the highest confidence value as the “predicted class.” A high confidence value for the predicted class therefore indicates that the model is very confident of its prediction, and vice versa.

diff --git a/...pages/llm-university/intro-text-representation/clustering-hacker-news-posts.mdx b/...pages/llm-university/intro-text-representation/clustering-hacker-news-posts.mdx
@@ -57,9 +57,9 @@ The next step is to embed these titles so we can examine the dataset based on th
 As you've seen before, Cohere’s <a target="_blank" href="https://docs.cohere.ai/embed-reference?ref=txt.cohere.com&amp;__hstc=14363112.fb39cf5aec47995e64cd26603e2e04d9.1682489949734.1683512904818.1683517385804.31&amp;__hssc=14363112.98.1683517385804&amp;__hsfp=3640182760">embed endpoint</a> gives us vector representations from a large embedding language model specifically tuned for text embedding (as opposed to word embedding or text generation).
 
 ```python PYTHON
-embeds = co.embed(texts=list_of_posts,                  				
-                  model="small",
-                  truncate="LEFT").embeddings
+embeds = co.embed(
+    texts=list_of_posts, model="small", truncate="LEFT"
+).embeddings
 ```
 
 ### Plotting
@@ -167,9 +167,8 @@ We can use a hierarchical plot to better understand the hierarchy of the cluster
 For this plot, we use the hierarchy package from scipy:
 
 ```python PYTHON
-Z = hierarchy.linkage(kmeans_model.cluster_centers_, 'single')
-dn = hierarchy.dendrogram(Z, orientation='right',
-                         labels=label_list)
+Z = hierarchy.linkage(kmeans_model.cluster_centers_, "single")
+dn = hierarchy.dendrogram(Z, orientation="right", labels=label_list)
 ```
 
 Here’s how we can read this hierarchy (scanning it from right to left):

diff --git a/.../pages/llm-university/intro-text-representation/clustering-using-embeddings.mdx b/.../pages/llm-university/intro-text-representation/clustering-using-embeddings.mdx
@@ -31,16 +31,23 @@ from sklearn.cluster import KMeans
 
 # Pick the number of clusters
 df_clust = df_pc2.copy()
-n_clusters=2
+n_clusters = 2
 
 # Cluster the embeddings
 kmeans_model = KMeans(n_clusters=n_clusters, random_state=0)
 classes = kmeans_model.fit_predict(embeds).tolist()
-df_clust['cluster'] = (list(map(str,classes)))
+df_clust["cluster"] = list(map(str, classes))
 
 # Plot on a chart
 df_clust.columns = df_clust.columns.astype(str)
-generate_chart(df_clust.iloc[:sample],'0','1',lbl='on',color='cluster',title='Clustering with 2 Clusters')
+generate_chart(
+    df_clust.iloc[:sample],
+    "0",
+    "1",
+    lbl="on",
+    color="cluster",
+    title="Clustering with 2 Clusters",
+)
 ```
 
 The plot below shows the clusters that the algorithm returned. It looks to be spot on, where we have one cluster related to airline information and one cluster related to ground service information.

diff --git a/fern/pages/llm-university/intro-text-representation/clustering-with-embeddings.mdx b/fern/pages/llm-university/intro-text-representation/clustering-with-embeddings.mdx
@@ -37,8 +37,10 @@ We embed the documents using the same `get_embeddings()` function as before, but
 
 ```python PYTHON
 # Embed the text for clustering
-df['clustering_embeds'] = get_embeddings(df['query'].tolist(), input_type="clustering")
-embeds = np.array(df['clustering_embeds'].tolist())
+df["clustering_embeds"] = get_embeddings(
+    df["query"].tolist(), input_type="clustering"
+)
+embeds = np.array(df["clustering_embeds"].tolist())
 ```
 
 ### Step 2: Cluster the Embeddings
@@ -57,7 +59,7 @@ classes = kmeans_model.fit_predict(embeds).tolist()
 
 # Store the cluster assignments
 df_clust = df_pc2.copy()
-df_clust['cluster'] = (list(map(str,classes)))
+df_clust["cluster"] = list(map(str, classes))
 ```
 
 ### Step 3: Visualize the Results in a 2D Plot

diff --git a/fern/pages/llm-university/intro-text-representation/embed-endpoint.mdx b/fern/pages/llm-university/intro-text-representation/embed-endpoint.mdx
@@ -22,7 +22,10 @@ For the setup, please refer to the <a target="_blank" href="/docs/setting-up">Se
 The dataset we'll use consists of the top 50 search terms on the web about "Hello, World!".
 
 ```python PYTHON
-df = pd.read_csv("https://github.com/cohere-ai/cohere-developer-experience/raw/main/notebooks/data/hello-world-kw.csv", names=["search_term"])
+df = pd.read_csv(
+    "https://github.com/cohere-ai/cohere-developer-experience/raw/main/notebooks/data/hello-world-kw.csv",
+    names=["search_term"],
+)
 df.head()
 ```
 
@@ -55,13 +58,15 @@ The code looks like this:
 
 ```python PYTHON
 def embed_text(texts):
-  output = co.embed(
-                model="embed-english-v3.0",
-    						input_type="search_document",
-                texts=texts)
-  embedding = output.embeddings
+    output = co.embed(
+        model="embed-english-v3.0",
+        input_type="search_document",
+        texts=texts,
+    )
+    embedding = output.embeddings
+
+    return embedding
 
-  return embedding
 
 df["search_term_embeds"] = embed_text(df["search_term"].tolist())
 ```
@@ -82,15 +87,16 @@ We can make use of the <a target="_blank" href="https://umap-learn.readthedocs.i
 ```python PYTHON
 # If you don't have umap installed, please run `pip install umap-learn` first!
 import umap
-embeds = list(df["search_term_embeds"]) 
+
+embeds = list(df["search_term_embeds"])
 
 # Compress the embeddings to 2 dimensions (UMAP's default reduction is to 2 dimensions)
-reducer = umap.UMAP(n_neighbors=49) 
+reducer = umap.UMAP(n_neighbors=49)
 umap_embeds = reducer.fit_transform(embeds)
 
 # Store the compressed embeddings in the dataframe/table
-df['x'] = umap_embeds[:,0]
-df['y'] = umap_embeds[:,1]
+df["x"] = umap_embeds[:, 0]
+df["y"] = umap_embeds[:, 1]
 ```
 
 You can then use any plotting library to visualize these compressed embeddings on a 2D plot.

diff --git a/fern/pages/llm-university/intro-text-representation/few-shot-classification.mdx b/fern/pages/llm-university/intro-text-representation/few-shot-classification.mdx
@@ -30,7 +30,7 @@ from cohere import ClassifyExample
 We also create a Cohere client.
 
 ```python PYTHON
-co = cohere.Client("COHERE_API_KEY") # Your Cohere API key
+co = cohere.Client("COHERE_API_KEY")  # Your Cohere API key
 ```
 
 ### Step 1: Prepare Examples and Input
@@ -55,38 +55,49 @@ Our sentiment analysis classifier has three classes with five examples each: “
 The examples:
 
 ```python PYTHON
-examples = [ClassifyExample(text="I’m so proud of you", label="positive"), 
-            ClassifyExample(text="What a great time to be alive", label="positive"), 
-            ClassifyExample(text="That’s awesome work", label="positive"), 
-            ClassifyExample(text="The service was amazing", label="positive"), 
-            ClassifyExample(text="I love my family", label="positive"), 
-            ClassifyExample(text="They don't care about me", label="negative"), 
-            ClassifyExample(text="I hate this place", label="negative"), 
-            ClassifyExample(text="The most ridiculous thing I've ever heard", label="negative"), 
-            ClassifyExample(text="I am really frustrated", label="negative"), 
-            ClassifyExample(text="This is so unfair", label="negative"),
-            ClassifyExample(text="This made me think", label="neutral"), 
-            ClassifyExample(text="The good old days", label="neutral"), 
-            ClassifyExample(text="What's the difference", label="neutral"), 
-            ClassifyExample(text="You can't ignore this", label="neutral"), 
-            ClassifyExample(text="That's how I see it", label="neutral")]
+examples = [
+    ClassifyExample(text="I’m so proud of you", label="positive"),
+    ClassifyExample(
+        text="What a great time to be alive", label="positive"
+    ),
+    ClassifyExample(text="That’s awesome work", label="positive"),
+    ClassifyExample(text="The service was amazing", label="positive"),
+    ClassifyExample(text="I love my family", label="positive"),
+    ClassifyExample(
+        text="They don't care about me", label="negative"
+    ),
+    ClassifyExample(text="I hate this place", label="negative"),
+    ClassifyExample(
+        text="The most ridiculous thing I've ever heard",
+        label="negative",
+    ),
+    ClassifyExample(text="I am really frustrated", label="negative"),
+    ClassifyExample(text="This is so unfair", label="negative"),
+    ClassifyExample(text="This made me think", label="neutral"),
+    ClassifyExample(text="The good old days", label="neutral"),
+    ClassifyExample(text="What's the difference", label="neutral"),
+    ClassifyExample(text="You can't ignore this", label="neutral"),
+    ClassifyExample(text="That's how I see it", label="neutral"),
+]
 ```
 
 The inputs (we have twelve in this example):
 
 ```python PYTHON
-inputs = ["Hello, world! What a beautiful day",
-          "It was a great time with great people",
-          "Great place to work",
-          "That was a wonderful evening",
-          "Maybe this is why",
-          "Let's start again",
-          "That's how I see it",
-          "These are all facts",
-          "This is the worst thing",
-          "I cannot stand this any longer",
-          "This is really annoying",
-          "I am just plain fed up"]
+inputs = [
+    "Hello, world! What a beautiful day",
+    "It was a great time with great people",
+    "Great place to work",
+    "That was a wonderful evening",
+    "Maybe this is why",
+    "Let's start again",
+    "That's how I see it",
+    "These are all facts",
+    "This is the worst thing",
+    "I cannot stand this any longer",
+    "This is really annoying",
+    "I am just plain fed up",
+]
 ```
 
 ### Step 2: Generate Predictions
@@ -108,14 +119,14 @@ def classify_text(inputs, examples):
     """
     # Classify text by calling the Classify endpoint
     response = co.classify(
-        model='embed-english-v2.0',
-        inputs=inputs,
-        examples=examples)
+        model="embed-english-v2.0", inputs=inputs, examples=examples
+    )
 
     classifications = response.classifications
 
     return classifications
 
+
 # Classify the inputs
 predictions = classify_text(inputs, examples)
 ```