Your First Analysis Pipeline
This guide walks you through building a complete sentiment analysis pipeline that scrapes reviews, analyzes them, and generates insights.
What We'll Build
A pipeline that:
- Scrapes reviews from Steam
- Analyzes sentiment for each review
- Detects emotions
- Generates an LLM-powered summary
- Exports results to JSON
Prerequisites
# Install with required extras (quoted so shells such as zsh do not treat the brackets as a glob)
pip install "sentimatrix[llm,scraping]"
# Get a free Groq API key from https://console.groq.com
export GROQ_API_KEY="gsk_..."
Step 1: Basic Setup
Create a new file analyze_game.py:
import asyncio
import json
from datetime import datetime
from sentimatrix import Sentimatrix
from sentimatrix.config import SentimatrixConfig, LLMConfig
# Library configuration: Groq as the LLM provider, with the model named below.
CONFIG = SentimatrixConfig(
    llm=LLMConfig(
        provider="groq",
        model="llama-3.3-70b-versatile",
    ),
)

# Steam store page of the game whose reviews are analyzed.
GAME_URL = "https://store.steampowered.com/app/1245620/ELDEN_RING/"
# Upper bound on the number of reviews requested from the scraper.
MAX_REVIEWS = 100
Step 2: Scrape Reviews
Add the scraping function:
async def scrape_game_reviews(sm: Sentimatrix) -> list:
    """Fetch reviews for ``GAME_URL`` from Steam.

    Args:
        sm: Sentimatrix client used to run the scrape.

    Returns:
        The result of ``sm.scrape_reviews`` for the configured game,
        requested with ``MAX_REVIEWS`` as the upper bound.
    """
    print(f"Scraping reviews from Steam...")

    scraped = await sm.scrape_reviews(
        url=GAME_URL,
        platform="steam",
        max_reviews=MAX_REVIEWS,
    )

    print(f"Scraped {len(scraped)} reviews")
    return scraped
Step 3: Analyze Sentiment
Add sentiment analysis:
async def analyze_sentiments(sm: Sentimatrix, reviews: list) -> list:
    """Run batch sentiment analysis and merge each result with its review.

    Args:
        sm: Sentimatrix client providing ``analyze_batch``.
        reviews: Review objects exposing ``text``, ``rating`` and
            ``helpful_count`` attributes.

    Returns:
        One dict per (review, result) pair with the keys ``text``,
        ``rating``, ``helpful_count``, ``sentiment``, ``confidence`` and
        ``scores``.
    """
    print("Analyzing sentiments...")

    # All review texts go through a single analyze_batch call.
    outcomes = await sm.analyze_batch([item.text for item in reviews])

    # zip() pairs review and result by position.
    return [
        {
            "text": item.text,
            "rating": item.rating,
            "helpful_count": item.helpful_count,
            "sentiment": outcome.sentiment,
            "confidence": outcome.confidence,
            "scores": outcome.scores,
        }
        for item, outcome in zip(reviews, outcomes)
    ]
Step 4: Detect Emotions
Add emotion detection:
async def detect_emotions(sm: Sentimatrix, reviews: list) -> list:
    """Attach emotion data to reviews whose sentiment confidence exceeds 0.8.

    Args:
        sm: Sentimatrix client providing ``detect_emotions``.
        reviews: Analyzed review dicts with ``text`` and ``confidence`` keys.

    Returns:
        The same ``reviews`` list. Qualifying dicts are updated in place
        with an ``emotions`` entry (``primary`` and ``scores``); the others
        are left untouched.
    """
    print("Detecting emotions...")

    # Select the high-confidence reviews up front, then query them one by one.
    confident = [entry for entry in reviews if entry["confidence"] > 0.8]

    for entry in confident:
        detected = await sm.detect_emotions(entry["text"])
        entry["emotions"] = {
            "primary": detected.primary,
            "scores": detected.scores,
        }

    return reviews
Step 5: Calculate Statistics
Add statistics calculation:
def calculate_stats(reviews: list) -> dict:
    """Compute aggregate statistics over analyzed reviews.

    Args:
        reviews: Analyzed review dicts. Each needs ``sentiment``,
            ``confidence`` and ``text`` keys; ``helpful_count`` is optional
            and may be ``None``.

    Returns:
        A dict with ``total_reviews``, ``sentiment_distribution`` (raw
        counts), ``sentiment_percentages`` and ``average_confidence``
        (formatted strings), and ``most_helpful_positive`` /
        ``most_helpful_negative`` (first 200 characters of the review text,
        or ``None`` when no review has that sentiment). An empty ``reviews``
        list yields zero counts, ``0.0%`` shares and ``0.00%`` confidence.
    """
    print("Calculating statistics...")

    total = len(reviews)

    # Seed the three standard labels so they are always reported, but count
    # any other label too instead of raising KeyError.
    sentiments = {"positive": 0, "negative": 0, "neutral": 0}
    for r in reviews:
        label = r["sentiment"]
        sentiments[label] = sentiments.get(label, 0) + 1

    # Every division is guarded: an empty scrape must not raise
    # ZeroDivisionError.
    avg_confidence = sum(r["confidence"] for r in reviews) / total if total else 0.0

    def percentage(count):
        return count / total * 100 if total else 0.0

    def helpfulness(review):
        # Missing or None counts rank as 0 so max() never compares None with int.
        return review.get("helpful_count") or 0

    def excerpt_of_most_helpful(label):
        candidates = [r for r in reviews if r["sentiment"] == label]
        best = max(candidates, key=helpfulness, default=None)
        return None if best is None else best["text"][:200]

    return {
        "total_reviews": total,
        "sentiment_distribution": sentiments,
        "sentiment_percentages": {
            k: f"{percentage(v):.1f}%" for k, v in sentiments.items()
        },
        "average_confidence": f"{avg_confidence:.2%}",
        "most_helpful_positive": excerpt_of_most_helpful("positive"),
        "most_helpful_negative": excerpt_of_most_helpful("negative"),
    }
Step 6: Generate LLM Summary
Add LLM-powered summarization:
async def generate_summary(sm: Sentimatrix, reviews: list) -> dict:
    """Produce an LLM summary plus pros and cons from the first 50 reviews.

    Args:
        sm: Sentimatrix client providing ``summarize_reviews`` and
            ``generate_insights``.
        reviews: Analyzed review dicts; only the ``text`` key is used.

    Returns:
        A dict with ``summary``, the first five ``pros``, the first five
        ``cons`` and ``recommendation``.
    """
    print("Generating AI summary...")

    # Only the text of the first 50 reviews is sent to each LLM call.
    overview = await sm.summarize_reviews(
        reviews=[{"text": r["text"]} for r in reviews[:50]],
        style="professional",
    )

    findings = await sm.generate_insights(
        reviews=[{"text": r["text"]} for r in reviews[:50]],
    )

    return {
        "summary": overview,
        "pros": findings.pros[:5],
        "cons": findings.cons[:5],
        "recommendation": findings.recommendation,
    }
Step 7: Export Results
Add export functionality:
def export_results(reviews: list, stats: dict, summary: dict) -> str:
    """Write the analysis to a timestamped JSON file.

    Args:
        reviews: Analyzed review dicts; only the first 20 are written out.
        stats: Aggregate statistics, stored under ``statistics``.
        summary: LLM output, stored under ``ai_summary``.

    Returns:
        The name of the written file, ``analysis_<YYYYmmdd_HHMMSS>.json``
        (a relative path).
    """
    # Take one timestamp so the metadata and the filename always agree.
    now = datetime.now()

    output = {
        "metadata": {
            "url": GAME_URL,
            "analyzed_at": now.isoformat(),
            "total_reviews": len(reviews),
        },
        "statistics": stats,
        "ai_summary": summary,
        "reviews": reviews[:20],  # first 20 reviews, in input order
    }

    filename = f"analysis_{now.strftime('%Y%m%d_%H%M%S')}.json"
    # default=str stringifies any value json cannot encode natively.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, default=str)

    print(f"Results exported to {filename}")
    return filename
Step 8: Main Pipeline
Tie it all together:
async def main():
    """Run the pipeline end to end and print a short report."""
    banner = "=" * 60
    print(banner)
    print("Sentimatrix Game Review Analysis Pipeline")
    print(banner)

    async with Sentimatrix(CONFIG) as sm:
        # Scrape, then analyze sentiment and emotions.
        reviews = await scrape_game_reviews(sm)
        analyzed = await analyze_sentiments(sm, reviews)
        analyzed = await detect_emotions(sm, analyzed)

        # Aggregate, summarize with the LLM, and write the JSON report.
        stats = calculate_stats(analyzed)
        summary = await generate_summary(sm, analyzed)
        filename = export_results(analyzed, stats, summary)

        # Console report.
        print("\n" + banner)
        print("ANALYSIS COMPLETE")
        print(banner)
        print(f"\nTotal Reviews: {stats['total_reviews']}")
        print(f"Sentiment Distribution:")
        for sentiment, pct in stats['sentiment_percentages'].items():
            print(f" {sentiment:>10}: {pct}")
        print(f"\nAI Summary:\n{summary['summary'][:500]}...")
        print(f"\nResults saved to: {filename}")


if __name__ == "__main__":
    asyncio.run(main())
Complete Script
Here's the full script:
analyze_game.py (Complete)
#!/usr/bin/env python3
"""
Sentimatrix Game Review Analysis Pipeline
Scrapes Steam reviews, analyzes sentiment, detects emotions,
and generates AI-powered insights.
Usage:
python analyze_game.py
"""
import asyncio
import json
from datetime import datetime
from sentimatrix import Sentimatrix
from sentimatrix.config import SentimatrixConfig, LLMConfig
# Library configuration: Groq as the LLM provider, with the model named below.
CONFIG = SentimatrixConfig(
    llm=LLMConfig(
        provider="groq",
        model="llama-3.3-70b-versatile",
    ),
)

# Steam store page of the game whose reviews are analyzed.
GAME_URL = "https://store.steampowered.com/app/1245620/ELDEN_RING/"
# Upper bound on the number of reviews requested from the scraper.
MAX_REVIEWS = 100
async def scrape_game_reviews(sm: Sentimatrix) -> list:
    """Fetch reviews for ``GAME_URL`` from Steam.

    Args:
        sm: Sentimatrix client used to run the scrape.

    Returns:
        The result of ``sm.scrape_reviews`` for the configured game,
        requested with ``MAX_REVIEWS`` as the upper bound.
    """
    print(f"Scraping reviews from Steam...")

    scraped = await sm.scrape_reviews(
        url=GAME_URL,
        platform="steam",
        max_reviews=MAX_REVIEWS,
    )

    print(f"Scraped {len(scraped)} reviews")
    return scraped
async def analyze_sentiments(sm: Sentimatrix, reviews: list) -> list:
    """Run batch sentiment analysis and merge each result with its review.

    Args:
        sm: Sentimatrix client providing ``analyze_batch``.
        reviews: Review objects exposing ``text``, ``rating`` and
            ``helpful_count`` attributes.

    Returns:
        One dict per (review, result) pair with the keys ``text``,
        ``rating``, ``helpful_count``, ``sentiment``, ``confidence`` and
        ``scores``.
    """
    print("Analyzing sentiments...")

    # All review texts go through a single analyze_batch call.
    outcomes = await sm.analyze_batch([item.text for item in reviews])

    # zip() pairs review and result by position.
    return [
        {
            "text": item.text,
            "rating": item.rating,
            "helpful_count": item.helpful_count,
            "sentiment": outcome.sentiment,
            "confidence": outcome.confidence,
            "scores": outcome.scores,
        }
        for item, outcome in zip(reviews, outcomes)
    ]
async def detect_emotions(sm: Sentimatrix, reviews: list) -> list:
    """Attach emotion data to reviews whose sentiment confidence exceeds 0.8.

    Args:
        sm: Sentimatrix client providing ``detect_emotions``.
        reviews: Analyzed review dicts with ``text`` and ``confidence`` keys.

    Returns:
        The same ``reviews`` list. Qualifying dicts are updated in place
        with an ``emotions`` entry (``primary`` and ``scores``); the others
        are left untouched.
    """
    print("Detecting emotions...")

    # Select the high-confidence reviews up front, then query them one by one.
    confident = [entry for entry in reviews if entry["confidence"] > 0.8]

    for entry in confident:
        detected = await sm.detect_emotions(entry["text"])
        entry["emotions"] = {
            "primary": detected.primary,
            "scores": detected.scores,
        }

    return reviews
def calculate_stats(reviews: list) -> dict:
    """Compute aggregate statistics over analyzed reviews.

    Args:
        reviews: Analyzed review dicts. Each needs ``sentiment``,
            ``confidence`` and ``text`` keys; ``helpful_count`` is optional
            and may be ``None``.

    Returns:
        A dict with ``total_reviews``, ``sentiment_distribution`` (raw
        counts), ``sentiment_percentages`` and ``average_confidence``
        (formatted strings), and ``most_helpful_positive`` /
        ``most_helpful_negative`` (first 200 characters of the review text,
        or ``None`` when no review has that sentiment). An empty ``reviews``
        list yields zero counts, ``0.0%`` shares and ``0.00%`` confidence.
    """
    print("Calculating statistics...")

    total = len(reviews)

    # Seed the three standard labels so they are always reported, but count
    # any other label too instead of raising KeyError.
    sentiments = {"positive": 0, "negative": 0, "neutral": 0}
    for r in reviews:
        label = r["sentiment"]
        sentiments[label] = sentiments.get(label, 0) + 1

    # Every division is guarded: an empty scrape must not raise
    # ZeroDivisionError.
    avg_confidence = sum(r["confidence"] for r in reviews) / total if total else 0.0

    def percentage(count):
        return count / total * 100 if total else 0.0

    def helpfulness(review):
        # Missing or None counts rank as 0 so max() never compares None with int.
        return review.get("helpful_count") or 0

    def excerpt_of_most_helpful(label):
        candidates = [r for r in reviews if r["sentiment"] == label]
        best = max(candidates, key=helpfulness, default=None)
        return None if best is None else best["text"][:200]

    return {
        "total_reviews": total,
        "sentiment_distribution": sentiments,
        "sentiment_percentages": {
            k: f"{percentage(v):.1f}%" for k, v in sentiments.items()
        },
        "average_confidence": f"{avg_confidence:.2%}",
        "most_helpful_positive": excerpt_of_most_helpful("positive"),
        "most_helpful_negative": excerpt_of_most_helpful("negative"),
    }
async def generate_summary(sm: Sentimatrix, reviews: list) -> dict:
    """Produce an LLM summary plus pros and cons from the first 50 reviews.

    Args:
        sm: Sentimatrix client providing ``summarize_reviews`` and
            ``generate_insights``.
        reviews: Analyzed review dicts; only the ``text`` key is used.

    Returns:
        A dict with ``summary``, the first five ``pros``, the first five
        ``cons`` and ``recommendation``.
    """
    print("Generating AI summary...")

    # Only the text of the first 50 reviews is sent to each LLM call.
    overview = await sm.summarize_reviews(
        reviews=[{"text": r["text"]} for r in reviews[:50]],
        style="professional",
    )

    findings = await sm.generate_insights(
        reviews=[{"text": r["text"]} for r in reviews[:50]],
    )

    return {
        "summary": overview,
        "pros": findings.pros[:5],
        "cons": findings.cons[:5],
        "recommendation": findings.recommendation,
    }
def export_results(reviews: list, stats: dict, summary: dict) -> str:
    """Write the analysis to a timestamped JSON file.

    Args:
        reviews: Analyzed review dicts; only the first 20 are written out.
        stats: Aggregate statistics, stored under ``statistics``.
        summary: LLM output, stored under ``ai_summary``.

    Returns:
        The name of the written file, ``analysis_<YYYYmmdd_HHMMSS>.json``
        (a relative path).
    """
    # Take one timestamp so the metadata and the filename always agree.
    now = datetime.now()

    output = {
        "metadata": {
            "url": GAME_URL,
            "analyzed_at": now.isoformat(),
            "total_reviews": len(reviews),
        },
        "statistics": stats,
        "ai_summary": summary,
        "reviews": reviews[:20],  # first 20 reviews, in input order
    }

    filename = f"analysis_{now.strftime('%Y%m%d_%H%M%S')}.json"
    # default=str stringifies any value json cannot encode natively.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, default=str)

    print(f"Results exported to {filename}")
    return filename
async def main():
    """Run the pipeline end to end and print a short report."""
    banner = "=" * 60
    print(banner)
    print("Sentimatrix Game Review Analysis Pipeline")
    print(banner)

    async with Sentimatrix(CONFIG) as sm:
        # Scrape, then analyze sentiment and emotions.
        reviews = await scrape_game_reviews(sm)
        analyzed = await analyze_sentiments(sm, reviews)
        analyzed = await detect_emotions(sm, analyzed)

        # Aggregate, summarize with the LLM, and write the JSON report.
        stats = calculate_stats(analyzed)
        summary = await generate_summary(sm, analyzed)
        filename = export_results(analyzed, stats, summary)

        # Console report.
        print("\n" + banner)
        print("ANALYSIS COMPLETE")
        print(banner)
        print(f"\nTotal Reviews: {stats['total_reviews']}")
        print(f"Sentiment Distribution:")
        for sentiment, pct in stats['sentiment_percentages'].items():
            print(f" {sentiment:>10}: {pct}")
        print(f"\nResults saved to: {filename}")


if __name__ == "__main__":
    asyncio.run(main())
Running the Pipeline
Run the script with `python analyze_game.py`. Expected output:
============================================================
Sentimatrix Game Review Analysis Pipeline
============================================================
Scraping reviews from Steam...
Scraped 100 reviews
Analyzing sentiments...
Detecting emotions...
Calculating statistics...
Generating AI summary...
Results exported to analysis_20250129_143022.json
============================================================
ANALYSIS COMPLETE
============================================================
Total Reviews: 100
Sentiment Distribution:
positive: 72.0%
negative: 18.0%
neutral: 10.0%
Results saved to: analysis_20250129_143022.json
Next Steps
- LLM Providers - Try different LLM providers
- Platform Scrapers - Scrape from other platforms
- Configuration - Customize your setup
- Examples - More example pipelines