Josh Josue (jjosue@shield-legal.com)
2025-06-25 13:14:58

πŸ‘‹

Chris Krecicki (ckrecicki@shield-legal.com)
2025-06-25 14:16:59

# Standard library
import re
import string
from collections import defaultdict

# Third-party
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

Download required NLTK data

# Download the sentence/word tokenizer models if not already present.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Download the English stopword list if not already present.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

class GloVeKeywordExtractor:
    """Keyword discovery and quote extraction backed by GloVe word embeddings."""

    def __init__(self, glove_file_path):
        """
        Initialize with GloVe embeddings file path.
        Download GloVe from: https://nlp.stanford.edu/projects/glove/

        Parameters
        ----------
        glove_file_path : str
            Path to a GloVe text file (one ``word v1 v2 ...`` entry per line).
        """
        # word (lowercase) -> np.ndarray embedding vector
        self.embeddings = {}
        self.load_glove_embeddings(glove_file_path)
        # English stopwords used to filter candidate keywords
        self.stop_words = set(stopwords.words('english'))

def load_glove_embeddings(self, file_path):
    """Populate ``self.embeddings`` from a GloVe text file.

    Each line is expected to look like ``<word> <float> <float> ...``;
    the word becomes the key and the floats become a float32 vector.
    """
    print("Loading GloVe embeddings...")
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            token, *components = raw_line.split()
            self.embeddings[token] = np.array(components, dtype='float32')
    print(f"Loaded {len(self.embeddings)} word embeddings")

def preprocess_text(self, text):
    """Normalize whitespace in *text* and split it into sentences."""
    # Collapse every run of whitespace to a single space before tokenizing.
    normalized = re.sub(r'\s+', ' ', text).strip()
    return sent_tokenize(normalized)

def get_word_embedding(self, word):
    """Return the embedding vector for *word* (case-insensitive), or None."""
    return self.embeddings.get(word.lower())

def find_similar_words(self, keyword, top_k=10, similarity_threshold=0.5):
    """Find words similar to the keyword using cosine similarity.

    Parameters
    ----------
    keyword : str
        Query word; looked up case-insensitively in the embeddings.
    top_k : int
        Maximum number of similar words to return.
    similarity_threshold : float
        Minimum cosine similarity for a word to be kept.

    Returns
    -------
    list[tuple[str, float]]
        (word, similarity) pairs sorted by descending similarity; empty
        when the keyword has no embedding (a warning is printed).
    """
    key = keyword.lower()
    keyword_embedding = self.embeddings.get(key)
    if keyword_embedding is None:
        print(f"Warning: '{keyword}' not found in embeddings")
        return []

    # Vectorized cosine similarity over the whole vocabulary: one matrix
    # product instead of one sklearn cosine_similarity() call per word,
    # which incurred large Python-level overhead for every vocabulary entry.
    candidates = [word for word in self.embeddings if word != key]
    if not candidates:
        return []
    matrix = np.stack([self.embeddings[word] for word in candidates])
    query = np.asarray(keyword_embedding, dtype='float32')
    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    denom[denom == 0] = 1e-12  # guard against zero vectors
    scores = (matrix @ query) / denom

    similarities = [
        (word, float(score))
        for word, score in zip(candidates, scores)
        if score >= similarity_threshold
    ]
    # Sort by similarity and return top_k
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_k]

def extract_keywords_from_text(self, text, min_word_length=3):
    """Return the unique candidate keywords found in *text*.

    A token qualifies when it is alphabetic, at least *min_word_length*
    characters long, not a stopword or punctuation, and present in the
    loaded embeddings.
    """
    unique_keywords = set()
    for token in word_tokenize(text.lower()):
        if token in self.stop_words or token in string.punctuation:
            continue
        if len(token) < min_word_length or not token.isalpha():
            continue
        if token in self.embeddings:
            unique_keywords.add(token)
    return list(unique_keywords)

def find_related_keywords(self, text, seed_keywords=None, top_k_similar=5):
    """
    Find related keywords in the text.

    If seed_keywords is provided, find words similar to those; otherwise,
    extract keywords from the text itself and use the first 10 as seeds.

    Parameters
    ----------
    text : str
        Document to analyze.
    seed_keywords : list[str] | None
        Optional seed terms to expand.
    top_k_similar : int
        How many similar words to request per seed.

    Returns
    -------
    dict[str, list[tuple[str, float]]]
        seed keyword -> similar words that actually occur in *text*.
    """
    if seed_keywords is None:
        # Extract keywords from the text itself
        extracted_keywords = self.extract_keywords_from_text(text)
        seed_keywords = extracted_keywords[:10]  # Use top 10 as seeds

    # Tokenize the text ONCE: this set is loop-invariant but was previously
    # rebuilt from scratch for every seed keyword.
    text_words = set(word.lower() for word in word_tokenize(text))

    related_keywords = defaultdict(list)
    for keyword in seed_keywords:
        similar_words = self.find_similar_words(keyword, top_k_similar)
        # Keep only the similar words that appear in the text itself.
        related_keywords[keyword] = [
            (word, score) for word, score in similar_words
            if word in text_words
        ]

    return dict(related_keywords)

def extract_relevant_quotes(self, text, keywords, context_window=1):
    """
    Extract sentences/quotes that contain the keywords or related words.

    Parameters
    ----------
    text : str
        Document to scan, split into sentences via ``preprocess_text``.
    keywords : dict | list
        Either the dict output of ``find_related_keywords``
        (keyword -> [(related_word, score), ...]) or a plain list of
        keyword strings; both shapes are handled below.
    context_window : int
        Number of sentences before/after the hit to include as context.

    Returns
    -------
    dict
        keyword -> list of {'quote', 'context', 'sentence_index'} records.
    """
    sentences = self.preprocess_text(text)
    relevant_quotes = defaultdict(list)

    # Create a set of all keywords and related words
    all_relevant_words = set()
    if isinstance(keywords, dict):
        # keywords is the output from find_related_keywords
        for keyword, related in keywords.items():
            all_relevant_words.add(keyword.lower())
            for word, _ in related:
                all_relevant_words.add(word.lower())
    else:
        # keywords is a simple list
        all_relevant_words = set(word.lower() for word in keywords)

    for i, sentence in enumerate(sentences):
        sentence_words = set(word.lower() for word in word_tokenize(sentence))

        # Check if sentence contains any relevant words
        if sentence_words.intersection(all_relevant_words):
            # Determine which keywords this sentence is relevant to
            matching_keywords = []

            if isinstance(keywords, dict):
                for keyword, related in keywords.items():
                    keyword_set = {keyword.lower()}
                    # NOTE(review): related words are not lowercased here,
                    # unlike in all_relevant_words above — presumably they
                    # already are lowercase (embedding keys); verify.
                    keyword_set.update(word for word, _ in related)
                    if sentence_words.intersection(keyword_set):
                        matching_keywords.append(keyword)
            else:
                for keyword in keywords:
                    if keyword.lower() in sentence_words:
                        matching_keywords.append(keyword)

            # Extract context window (clamped to the document bounds)
            start_idx = max(0, i - context_window)
            end_idx = min(len(sentences), i + context_window + 1)
            context = ' '.join(sentences[start_idx:end_idx])

            # A sentence matching several keywords is recorded under each.
            for keyword in matching_keywords:
                relevant_quotes[keyword].append({
                    'quote': sentence,
                    'context': context,
                    'sentence_index': i
                })

    return dict(relevant_quotes)

def analyze_text(self, text, seed_keywords=None, similarity_threshold=0.6, top_k_similar=5):
    """
    Complete analysis: find related keywords and extract relevant quotes.

    NOTE(review): ``similarity_threshold`` is accepted but never forwarded
    to ``find_related_keywords`` — preserved as-is; confirm intent.
    """
    print("Analyzing text for keywords and quotes...")

    # Step 1: discover keyword relationships in the document.
    keyword_map = self.find_related_keywords(text, seed_keywords, top_k_similar)

    # Step 2: pull out the sentences that mention any of those words.
    quote_map = self.extract_relevant_quotes(text, keyword_map)

    return {
        'related_keywords': keyword_map,
        'relevant_quotes': quote_map
    }

def print_results(self, results):
    """Pretty-print the output of ``analyze_text`` to stdout.

    Bug fix: the original used ``"="**60`` and ``"-" ** 30`` — ``**`` is
    exponentiation and raises TypeError on strings; string repetition is
    the ``*`` operator.
    """
    print("\n" + "=" * 60)
    print("KEYWORD ANALYSIS RESULTS")
    print("=" * 60)

    print("\n🔍 RELATED KEYWORDS:")
    print("-" * 30)
    for keyword, related in results['related_keywords'].items():
        print(f"\n'{keyword.upper()}' is related to:")
        for word, similarity in related:
            print(f"  • {word} (similarity: {similarity:.3f})")

    print("\n\n💬 RELEVANT QUOTES:")
    print("-" * 30)
    for keyword, quotes in results['relevant_quotes'].items():
        print(f"\n📌 Quotes for '{keyword.upper()}' ({len(quotes)} found):")
        for i, quote_data in enumerate(quotes, 1):
            print(f"\n  Quote {i}:")
            print(f"    \"{quote_data['quote']}\"")
            # Only show context when it adds sentences beyond the quote itself.
            if quote_data['context'] != quote_data['quote']:
                print(f"    Context: \"{quote_data['context']}\"")

Example usage

if __name__ == "__main__":
    # Initialize the extractor (you need to provide the path to your GloVe file)
    # Download from: https://nlp.stanford.edu/data/glove.6B.zip
    # extractor = GloVeKeywordExtractor('path/to/glove.6B.300d.txt')

    # Example text
    sample_text = """
Artificial intelligence has revolutionized the way we approach machine learning and data science.
Deep learning algorithms, particularly neural networks, have shown remarkable performance in
computer vision tasks. Natural language processing has also benefited greatly from these advances.
The technology behind autonomous vehicles relies heavily on AI and machine learning techniques.
Scientists are now exploring quantum computing as the next frontier in computational power.
Robotics and automation are transforming manufacturing industries worldwide.
"""

    # Example analysis (uncomment when you have GloVe file)
    # results = extractor.analyze_text(
    #     sample_text,
    #     seed_keywords=['artificial', 'intelligence', 'learning', 'technology']
    # )
    # extractor.print_results(results)

    print("GloVe Keyword Extractor initialized!")
    print("To use:")
    print("1. Download GloVe embeddings from https://nlp.stanford.edu/projects/glove/")
    print("2. Initialize: extractor = GloVeKeywordExtractor('path/to/glove.file')")
    print("3. Analyze: results = extractor.analyze_text(your_text, seed_keywords)")
    print("4. Print: extractor.print_results(results)")
nlp.stanford.edu
πŸ‘ Josh Josue
Chris Krecicki (ckrecicki@shield-legal.com)
2025-07-01 15:53:13

https://www.youtube.com/watch?v=LCEmiRjPEtQ

YouTube
Y Combinator (https://www.youtube.com/@ycombinator)
Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:18:03

just hit you back on gmail

πŸ‘ Josh Josue
Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:19:19

thanks, just replied

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:20:12

what I'm tryna say is that the issue isnt in the data structure

it's that the LLM's analysis misses things, hence, yielding an inaccurate result

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:20:20

what I'm tryna say is that the issue isnt in the data structure

it's that the LLM's analysis misses things, hence, yielding an inaccurate result

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:21:28

try what I gave you

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:21:55

how big is the context you're passing to the LLM for analysis anyway?

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:22:06

what are you passing it? just the json?

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:22:18

what do you mean by context? is that the plaintiff's narrative?

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:22:25

is it a pdf?

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:22:30

i see

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:22:31

no

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:22:40

what are you sending off for analysis

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:22:44

i'm passing the json, the prompt, and the plaintiff's narrative

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:22:44

like can i see it?

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:22:46

all string

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:22:55

send me the plaintiff's narrative

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:23:05

email it, stick it in a json file

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:23:15

the one on the email is the narrative

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:23:23

oh thats it?

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:23:27

yep

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:23:27

the thing in the prompt?

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:23:36

did you see that file I sent?

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:23:41

the prompt should solve your issue

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:23:47

and the typing

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:24:22

im convinced it was a type-ing issue with the way openai handle structured outputs

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:24:35

because that prompt is WAY too small to lose context of what's going on

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:24:50

just to confirm, your email mentioned that the json you pasted was the output of the script right?

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:25:07

yes

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:25:31

so it made up data for the "Kissing" category?

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:25:55

everything

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:26:26

OH

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:26:29

i see what youre talking about

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:26:34

in the # Example testimony (replace with actual testimony) sample_testimony = """ He grabbed my breasts over my shirt and then put his hands under my clothes. He also kissed me forcefully on the mouth. Later, he digitally penetrated me. """

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:26:42

replace it with your real testimony

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:26:49

oh ok gotcha

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:26:50

and give it a whirl

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:26:56

lemme run it

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:26:57

haha thats why i left the comment

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:26:59

πŸ˜›

πŸ˜† Josh Josue
Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:27:25

ill be here

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:28:38

LLMs are dumb and we must guide them

πŸ’― Josh Josue
Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:28:44

and then they are super powers

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:32:07

the output is as expected! thanks this looks promising

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:32:17

❀️ i got you bro

πŸ™ Josh Josue
Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:32:18

gonna refactor and run the other tests on it

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:32:26

let me know if you need anything

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:40:17

no question is a dumb question when it comes to help from me

πŸ‘ Josh Josue
Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:40:23

so just curious, this typing thing, is this inherent only to OpenAI or do others (like Ollama) do it too?

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:41:40

It has to do with how they trained the model. Some support structured outputs and have specific guides, most will have to do with types because the LLM needs direction on WHERE and WHAT so it can do the stuff like forming the questions based on your testimony.

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:41:56

As for who else offers that, I don't know.

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:41:59

ohh i see

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:42:05

so for example in Ollama

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:42:16

Ollama is the inference engine.

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:42:26

there might be a model out there where it WAS trained to handle structured data while other models were not

βœ… Chris Krecicki
Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:42:31

Ollama allows you to run models you download from their list

πŸ‘ Josh Josue
Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:42:41

(i've been experimenting with various models on ollama)

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:43:03

the new gpt-oss handles structured data

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:43:10

but it needs 16GB ram minimum

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:43:36

Check this out https://huggingface.co/docs/inference-providers/en/guides/structured-output

huggingface.co
πŸ‘ Josh Josue
Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:43:53

does that imply that each time I say "ollama run _" then it loads the entire model in-memory?

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:44:04

yeah

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:44:09

gotcha

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:44:14

read that thing that will answer your question on local models

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:44:19

ok i'm learning a lot vocab and stuff, thanks!

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:44:42

the nomenclature is the hardest part of everything bro haha we've all been there, just be proud that you're learning it

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:44:51

glad i can help teach

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:45:20

i've heard of hugging face, they are a company that basically allows users to download various models right?

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:45:54

their the github of the AI space

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:45:58

they're

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:46:08

but better

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:46:22

i'm seeing models, spaces, datasets on their page

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:46:25

like if github and stackoverflow had a nice pretty baby

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:46:28

but for AI

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:46:37

oh ok

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:46:39

lol

Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:46:54

so possibly, one could download model from huggingface, and run it thru ollama?

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:47:13

yeah 100% you'd go to your powershell and type in wsl

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:47:18

then do ollama list

πŸ‘ Josh Josue
Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:47:25

and see available models

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:48:07

then just do a google search to see if it is trained to do structured outputs

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:48:15

then read that type article from hugging face

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:48:21

and give it a whirl

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:48:36

the inference engines just cant run any model because of things called templates

πŸ‘ Josh Josue
Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:48:51

ahh so the plot thickens

βœ… Chris Krecicki
Josh Josue (jjosue@shield-legal.com)
2025-08-06 15:48:52

lol

🀣 Chris Krecicki
Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:48:53

templates govern how data is handled during training for start stop etc etc

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:49:03

so when you run inference on it

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:49:26

you need to know how the template looks and whoever manages ollama or llama.cpp or whatever has to add a template for it to properly infer from the model

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:49:59

in llama.cpp you can add your own templates if you can figure out what they used

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:50:14

there are some standards people follow but I don't remember the names of them off the top of my head

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:50:31

take a peek at that

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-06 15:51:30

They have this so if you make your own model or train a lora and combine it with those models, and they dont support your template and you dont want to use transformers library up front, you can add your chat template

πŸ‘ Josh Josue
Josh Josue (jjosue@shield-legal.com)
2025-08-08 14:29:26

Hey Chris, is it ok that I'm using your openai api key? I tried to generate one but openai wouldn't let me unless I created a new org

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-08 16:59:59

sure -- just keep it safe or ask @James Scott for your own API key which would probably be better

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-08 17:00:32

The key is used for all our backend stuff so probably better to get your own key. We only get 10,000 API calls per day on our account.

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-08 17:00:52

So for any bigger data stuff. Checkout ollama and the gpt-oss and how to use openai library and ollama end points

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-08 17:05:39

Use this for like a few hundred a day here and there

Josh Josue (jjosue@shield-legal.com)
2025-08-08 17:49:27

oh i dont think i have that much, yeah thanks for letting me borrow it for this Abe rush order haha

Josh Josue (jjosue@shield-legal.com)
2025-08-08 17:49:31

i'll ask James for one

Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-21 11:55:04
Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-21 11:57:52

https://www.youtube.com/watch?v=OVqe6GTrDFM

YouTube
Prompt Engineering (https://www.youtube.com/@engineerprompt)
πŸ‘ Josh Josue
Chris Krecicki (ckrecicki@shield-legal.com)
2025-08-21 11:59:32

https://huggingface.co/docs/peft/en/index

huggingface.co
πŸ‘ Josh Josue