Adding Scrapers
Guide to adding new platform scrapers to Sentimatrix.
Scraper Structure
sentimatrix/providers/scrapers/platforms/
├── base.py # Base class
├── amazon.py # Example scraper
└── your_platform.py # New scraper
Implement Base Class
from sentimatrix.providers.scrapers.base import BaseScraper
from sentimatrix.models import Review
class YourPlatformScraper(BaseScraper):
    """Scraper for YourPlatform.

    Template for a new platform scraper: it declares the platform name and
    the URL patterns it handles, and implements ``scrape`` to turn a page
    into ``Review`` objects.
    """

    @property
    def platform_name(self) -> str:
        """Identifier written to the ``platform`` field of every review."""
        return "your_platform"

    @property
    def supported_url_patterns(self) -> list[str]:
        """Regular expressions describing the URLs this scraper accepts."""
        return [
            r"https?://yourplatform\.com/.*",
            r"https?://www\.yourplatform\.com/.*",
        ]

    async def validate_url(self, url: str) -> bool:
        """Return True if ``url`` matches one of ``supported_url_patterns``.

        The URL is matched against the declared patterns (anchored at the
        start of the string) rather than searched for the domain as a
        substring, so validation agrees with ``supported_url_patterns`` and
        rejects URLs that only mention the domain, e.g. in a query string
        on another host.
        """
        import re  # function-scope import keeps this snippet self-contained

        return any(
            re.match(pattern, url) for pattern in self.supported_url_patterns
        )

    async def scrape(
        self,
        url: str,
        max_reviews: int = 50,
        **kwargs
    ) -> list[Review]:
        """Fetch ``url`` and return at most ``max_reviews`` reviews.

        Args:
            url: Page to scrape.
            max_reviews: Upper bound on the number of reviews returned.
                Zero or a negative value yields an empty list.
            **kwargs: Accepted for platform-specific options; unused here.

        Returns:
            Reviews in the order produced by ``self._parse_reviews``.
        """
        reviews: list[Review] = []
        if max_reviews <= 0:
            # Nothing was requested, so skip the fetch entirely.
            return reviews

        # Fetch page
        html = await self._fetch(url)

        # Parse reviews.
        # NOTE(review): ``_parse_reviews`` is not defined in this example;
        # presumably it must be implemented here or provided by the base
        # class -- confirm. Each element is expected to expose the
        # attributes read below.
        for element in self._parse_reviews(html):
            review = Review(
                text=element.text,
                rating=element.rating,
                author=element.author,
                posted_date=element.date,
                helpful_count=element.helpful,
                platform=self.platform_name,
                metadata={
                    "review_id": element.id,
                    # Platform-specific data
                },
            )
            reviews.append(review)

            # Stop as soon as the limit is reached so no further elements
            # are pulled from the parser.
            if len(reviews) >= max_reviews:
                break

        return reviews
Register Scraper
1. Add the new platform to the ScraperProvider enum, if it is not already listed.
2. Register YourPlatformScraper in the scraper manager.
Add Tests
# tests/scrapers/test_your_platform.py
import pytest
from sentimatrix.providers.scrapers.platforms.your_platform import YourPlatformScraper
@pytest.mark.asyncio
async def test_scrape():
    """Scraping respects ``max_reviews`` and tags every review with the platform."""
    limit = 10
    product_url = "https://yourplatform.com/product/123"

    scraper = YourPlatformScraper()
    reviews = await scraper.scrape(product_url, max_reviews=limit)

    # Never more reviews than requested.
    assert len(reviews) <= limit

    # Every review carries this scraper's platform identifier.
    for review in reviews:
        assert review.platform == "your_platform"
@pytest.mark.asyncio
async def test_validate_url():
    """``validate_url`` accepts a YourPlatform URL and rejects a foreign one."""
    scraper = YourPlatformScraper()

    # validate_url is declared ``async``: without ``await`` the call only
    # returns a coroutine object, which is always truthy, so the assertions
    # would not test the actual result.
    assert await scraper.validate_url("https://yourplatform.com/product/123")
    assert not await scraper.validate_url("https://other.com/product/123")
Add Documentation
Create docs/scrapers/your-platform.md with:
- Quick start example
- Supported URL formats
- Review object fields
- Rate limits
- Platform-specific options