# scripts/scrape_cves.py

import requests
from bs4 import BeautifulSoup
import json
import os
import time
from typing import List, Dict

def create_search_dirs(applications: List[str]) -> None:
    """Create the output directories used to store search results.

    The base directory ``search_results`` is always created, even when
    *applications* is empty. Each application gets a subdirectory named
    after its lower-cased name. Directories that already exist are left
    untouched, so the function is safe to call repeatedly.

    Args:
        applications: Application names to create result directories for.
    """
    base_dir = "search_results"
    # exist_ok=True replaces the check-then-create pattern, which could
    # raise FileExistsError if the directory appeared between the
    # existence check and the makedirs call.
    os.makedirs(base_dir, exist_ok=True)

    for app in applications:
        os.makedirs(os.path.join(base_dir, app.lower()), exist_ok=True)

def scrape_cves(application: str, timeout: float = 30.0) -> List[Dict]:
    """Scrape CVE entries matching a keyword from the cve.mitre.org search page.

    The page is fetched with a browser-like User-Agent and every table row
    is inspected. A row is kept when it has exactly two ``<td>`` cells and
    the text of the first cell starts with ``CVE-``.

    Args:
        application: Keyword to search for (sent as the ``keyword`` query
            parameter).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        A list of dicts with the keys ``'cve_id'`` and ``'description'``.
        An empty list is returned when the request fails (the error is
        printed) or when no matching rows are found.
    """
    url = "https://cve.mitre.org/cgi-bin/cvekey.cgi"

    # Add headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # Passing the keyword through `params` lets requests URL-encode it,
        # so keywords containing spaces, '&' or '#' produce a valid query.
        response = requests.get(
            url,
            params={'keyword': application},
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error scraping {application}: {str(e)}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    cve_entries = []

    for row in soup.find_all('tr'):
        cells = row.find_all('td')
        # A CVE result row consists of exactly an ID cell and a description cell
        if len(cells) != 2:
            continue

        cve_id = cells[0].get_text(strip=True)
        # Skip two-cell rows that are not CVE entries
        if not cve_id.startswith('CVE-'):
            continue

        cve_entries.append({
            'cve_id': cve_id,
            'description': cells[1].get_text(strip=True)
        })

    return cve_entries

def main():
    """Scrape CVEs for a fixed set of applications and store each result as JSON.

    For every application the results are written to
    ``search_results/<lower-cased name>/search_cves.json``, followed by a
    two-second pause before the next request.
    """
    # Applications to look up (taken from the README)
    applications = [
        "Postgres", "MySQL", "Redis", "Nginx", "Apache", "Chrome",
        "Firefox", "Kubernetes", "Docker", "vCenter", "Minecraft",
    ]

    # Make sure every output directory exists before scraping starts
    create_search_dirs(applications)

    for app in applications:
        print(f"Scraping CVEs for {app}...")
        cves = scrape_cves(app)

        # Persist the entries for this application as pretty-printed JSON
        target_dir = os.path.join("search_results", app.lower())
        target_path = os.path.join(target_dir, "search_cves.json")
        with open(target_path, 'w', encoding='utf-8') as out:
            out.write(json.dumps(cves, indent=2, ensure_ascii=False))

        print(f"Saved {len(cves)} CVEs for {app}")

        # Pause between requests to be respectful to the server
        time.sleep(2)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()