import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import re
from collections import defaultdict
import string
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
class GloVeKeywordExtractor:
    def __init__(self, glove_file_path):
        """
        Initialize with GloVe embeddings file path
        Download GloVe from: https://nlp.stanford.edu/projects/glove/
        """
        self.embeddings = {}
        self.load_glove_embeddings(glove_file_path)
        self.stop_words = set(stopwords.words('english'))
def load_glove_embeddings(self, file_path):
"""Load GloVe embeddings from file"""
print("Loading GloVe embeddings...")
with open(file_path, 'r', encoding='utf-8') as f:
for line in f:
values = line.split()
word = values[0]
vector = np.array(values[1:], dtype='float32')
self.embeddings[word] = vector
print(f"Loaded {len(self.embeddings)} word embeddings")
def preprocess_text(self, text):
"""Clean and tokenize text"""
# Remove extra whitespace and normalize
text = re.sub(r'\s+', ' ', text).strip()
# Tokenize into sentences
sentences = sent_tokenize(text)
return sentences
def get_word_embedding(self, word):
"""Get embedding for a word (case-insensitive)"""
word = word.lower()
return self.embeddings.get(word, None)
def find_similar_words(self, keyword, top_k=10, similarity_threshold=0.5):
"""Find words similar to the keyword using cosine similarity"""
keyword_embedding = self.get_word_embedding(keyword)
if keyword_embedding is None:
print(f"Warning: '{keyword}' not found in embeddings")
return []
similarities = []
for word, embedding in self.embeddings.items():
if word != keyword.lower():
similarity = cosine_similarity(
keyword_embedding.reshape(1, -1),
embedding.reshape(1, -1)
)[0][0]
if similarity >= similarity_threshold:
similarities.append((word, similarity))
# Sort by similarity and return top_k
similarities.sort(key=lambda x: x[1], reverse=True)
return similarities[:top_k]
def extract_keywords_from_text(self, text, min_word_length=3):
"""Extract potential keywords from text"""
words = word_tokenize(text.lower())
# Filter out stopwords, punctuation, and short words
keywords = [
word for word in words
if (word not in self.stop_words and
word not in string.punctuation and
len(word) >= min_word_length and
word.isalpha() and
word in self.embeddings)
]
return list(set(keywords)) # Remove duplicates
def find_related_keywords(self, text, seed_keywords=None, top_k_similar=5):
"""
Find related keywords in the text
If seed_keywords provided, find words similar to those
Otherwise, extract keywords from text and find their relationships
"""
if seed_keywords is None:
# Extract keywords from the text itself
extracted_keywords = self.extract_keywords_from_text(text)
seed_keywords = extracted_keywords[:10] # Use top 10 as seeds
related_keywords = defaultdict(list)
for keyword in seed_keywords:
similar_words = self.find_similar_words(keyword, top_k_similar)
# Filter to only include words that appear in the text
text_words = set(word.lower() for word in word_tokenize(text))
relevant_similar = [
(word, score) for word, score in similar_words
if word in text_words
]
related_keywords[keyword] = relevant_similar
return dict(related_keywords)
def extract_relevant_quotes(self, text, keywords, context_window=1):
"""
Extract sentences/quotes that contain the keywords or related words
context_window: number of sentences before/after to include
"""
sentences = self.preprocess_text(text)
relevant_quotes = defaultdict(list)
# Create a set of all keywords and related words
all_relevant_words = set()
if isinstance(keywords, dict):
# keywords is the output from find_related_keywords
for keyword, related in keywords.items():
all_relevant_words.add(keyword.lower())
for word, _ in related:
all_relevant_words.add(word.lower())
else:
# keywords is a simple list
all_relevant_words = set(word.lower() for word in keywords)
for i, sentence in enumerate(sentences):
sentence_words = set(word.lower() for word in word_tokenize(sentence))
# Check if sentence contains any relevant words
if sentence_words.intersection(all_relevant_words):
# Determine which keywords this sentence is relevant to
matching_keywords = []
if isinstance(keywords, dict):
for keyword, related in keywords.items():
keyword_set = {keyword.lower()}
keyword_set.update(word for word, _ in related)
if sentence_words.intersection(keyword_set):
matching_keywords.append(keyword)
else:
for keyword in keywords:
if keyword.lower() in sentence_words:
matching_keywords.append(keyword)
# Extract context window
start_idx = max(0, i - context_window)
end_idx = min(len(sentences), i + context_window + 1)
context = ' '.join(sentences[start_idx:end_idx])
for keyword in matching_keywords:
relevant_quotes[keyword].append({
'quote': sentence,
'context': context,
'sentence_index': i
})
return dict(relevant_quotes)
def analyze_text(self, text, seed_keywords=None, similarity_threshold=0.6, top_k_similar=5):
"""
Complete analysis: find related keywords and extract relevant quotes
"""
print("Analyzing text for keywords and quotes...")
# Step 1: Find related keywords
related_keywords = self.find_related_keywords(
text, seed_keywords, top_k_similar
)
# Step 2: Extract relevant quotes
relevant_quotes = self.extract_relevant_quotes(text, related_keywords)
return {
'related_keywords': related_keywords,
'relevant_quotes': relevant_quotes
}
def print_results(self, results):
"""Pretty print the analysis results"""
print("\n" + "="**60)
print("KEYWORD ANALYSIS RESULTS")
print("="**60)
print("\nπ RELATED KEYWORDS:")
print("-" ** 30)
for keyword, related in results['related_keywords'].items():
print(f"\n'{keyword.upper()}' is related to:")
for word, similarity in related:
print(f" β’ {word} (similarity: {similarity:.3f})")
print("\n\n㪠RELEVANT QUOTES:")
print("-" ** 30)
for keyword, quotes in results['relevant_quotes'].items():
print(f"\nπ Quotes for '{keyword.upper()}' ({len(quotes)} found):")
for i, quote_data in enumerate(quotes, 1):
print(f"\n Quote {i}:")
print(f" \"{quote_data['quote']}\"")
if quote_data['context'] != quote_data['quote']:
print(f" Context: \"{quote_data['context']}\"")
if name == "main": # Initialize the extractor (you need to provide the path to your GloVe file) # Download from: https://nlp.stanford.edu/data/glove.6B.zip
# extractor = GloVeKeywordExtractor('path/to/glove.6B.300d.txt')
# Example text
sample_text = """
Artificial intelligence has revolutionized the way we approach machine learning and data science.
Deep learning algorithms, particularly neural networks, have shown remarkable performance in
computer vision tasks. Natural language processing has also benefited greatly from these advances.
The technology behind autonomous vehicles relies heavily on AI and machine learning techniques.
Scientists are now exploring quantum computing as the next frontier in computational power.
Robotics and automation are transforming manufacturing industries worldwide.
"""
# Example analysis (uncomment when you have GloVe file)
# results = extractor.analyze_text(
# sample_text,
# seed_keywords=['artificial', 'intelligence', 'learning', 'technology']
# )
# extractor.print_results(results)
print("GloVe Keyword Extractor initialized!")
print("To use:")
print("1. Download GloVe embeddings from <https://nlp.stanford.edu/projects/glove/>")
print("2. Initialize: extractor = GloVeKeywordExtractor('path/to/glove.file')")
print("3. Analyze: results = extractor.analyze_text(your_text, seed_keywords)")
print("4. Print: extractor.print_results(results)")
just hit you back on gmail
what I'm tryna say is that the issue isn't in the data structure
it's that the LLM's analysis misses things, hence yielding an inaccurate result
how big is the context you're passing to the LLM for analysis anyway?
what are you passing it? just the json?
what do you mean by context? is that the plaintiff's narrative?
what are you sending off for analysis
i'm passing the json, the prompt, and the plaintiff's narrative
I'm convinced it was a typing issue with the way OpenAI handles structured outputs
because that prompt is WAY too small to lose context of what's going on
just to confirm, your email mentioned that the json you pasted was the output of the script right?
so it made up data for the "Kissing" category?
in the
# Example testimony (replace with actual testimony)
sample_testimony = """
He grabbed my breasts over my shirt and then put his hands under my clothes.
He also kissed me forcefully on the mouth. Later, he digitally penetrated me.
"""
LLMs are dumb and we must guide them
the output is as expected! thanks this looks promising
❤️ i got you bro
gonna refactor and run the other tests on it
no question is a dumb question when it comes to help from me
so just curious, this typing thing, is it inherent only to OpenAI or do others (like Ollama) do it too?
It has to do with how they trained the model. Some support structured outputs and have specific guides; most of it comes down to types, because the LLM needs direction on WHERE and WHAT so it can do things like forming the questions based on your testimony.
As for who else offers that, I don't know.
there might be a model out there that WAS trained to handle structured data while other models were not
Ollama allows you to run models you download from their list
(i've been experimenting with various models on ollama)
the new gpt-oss handles structured data
Check this out https://huggingface.co/docs/inference-providers/en/guides/structured-output
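for reference, a structured-output request with the openai python library looks roughly like this (just a sketch; the model name and schema fields below are placeholders, not our actual prompt or schema):
from openai import OpenAI

client = OpenAI()
resp = client.chat.completions.create(
    model="gpt-4o-mini",  # placeholder; any structured-output-capable model
    messages=[{"role": "user", "content": "Categorize the acts described in this testimony: ..."}],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "testimony_categories",  # hypothetical schema, not our real one
            "strict": True,
            "schema": {
                "type": "object",
                "properties": {
                    "categories": {"type": "array", "items": {"type": "string"}}
                },
                "required": ["categories"],
                "additionalProperties": False
            }
        }
    }
)
print(resp.choices[0].message.content)  # JSON string that matches the schema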
does that imply that each time I say "ollama run _" it loads the entire model into memory?
read that thing, it will answer your question on local models
ok i'm learning a lot vocab and stuff, thanks!
the nomenclature is the hardest part of everything bro haha we've all been there, just be proud that you're learning it
i've heard of Hugging Face, they're a company that basically lets users download various models, right?
i'm seeing models, spaces, datasets on their page
like if github and stackoverflow had a nice pretty baby
so possibly, one could download a model from huggingface and run it through ollama?
yeah 100% you'd go to your powershell and type in wsl
then just do a google search to see if it is trained to do structured outputs
then read that article about types from hugging face
the inference engines can't run just any model because of things called templates
templates govern how data is formatted during training (start/stop tokens, etc.)
you need to know what the template looks like, and whoever maintains ollama or llama.cpp or whatever has to add a template for it to properly infer from the model
in llama.cpp you can add your own templates if you can figure out what they used
there are some standards people follow but i don't remember their names off the top of my head
They have this so that if you make your own model or train a LoRA and combine it with those models, and they don't support your template and you don't want to use the transformers library up front, you can add your own chat template
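to make the template idea concrete, here's a rough sketch of setting a ChatML-style Jinja chat template on a Hugging Face tokenizer and rendering a prompt with it (the template and model name are just illustrative; real models ship their own templates):
from transformers import AutoTokenizer

# Load any HF tokenizer; "gpt2" here is just a stand-in model name
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# A minimal ChatML-style Jinja template (one common convention, not a universal standard)
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>\n' }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)

messages = [{"role": "user", "content": "Summarize the testimony."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # shows how each turn gets wrapped in the start/stop tokens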
Hey Chris, is it ok that I'm using your openai api key? I tried to generate one but openai wouldn't let me unless I created a new org
sure -- just keep it safe or ask @James Scott for your own API key which would probably be better
The key is used for all our backend stuff so probably better to get your own key. We only get 10,000 API calls per day on our account.
So for any bigger data stuff, check out ollama and gpt-oss and how to use the openai library with ollama endpoints
Use this for like a few hundred a day here and there
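for reference, pointing the regular openai python library at Ollama's local OpenAI-compatible endpoint looks roughly like this (a sketch; the model tag is just whatever you've pulled with ollama):
from openai import OpenAI

# Ollama serves an OpenAI-compatible API locally on port 11434;
# the api_key just has to be a non-empty string
client = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

resp = client.chat.completions.create(
    model="gpt-oss",  # whatever model tag you've pulled locally
    messages=[{"role": "user", "content": "Say hello."}],
)
print(resp.choices[0].message.content)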
oh i don't think i have that much, yeah thanks for letting me borrow it for this Abe rush order haha
https://www.youtube.com/watch?v=OVqe6GTrDFM
https://huggingface.co/docs/peft/en/index