Source code for city_scrapers_core.spiders.spider

import re
from datetime import datetime
from typing import Mapping, Optional

from pytz import timezone
from scrapy import Spider

from ..constants import CANCELLED, PASSED, TENTATIVE


class CityScrapersSpider(Spider):
    """Base Spider class for City Scrapers projects.

    Provides a few utilities for common tasks like creating a meeting ID and
    checking the status based on meeting details.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the spider and record the current time in its timezone.

        Sets ``self.timezone`` to ``"America/Chicago"`` when the spider does not
        already define one, then stores ``year`` (int) and zero-padded
        ``month``, ``day`` and ``hour_min`` (``HHMM``) strings on the instance.

        :param args: Positional arguments forwarded to the parent initializer
        :param kwargs: Keyword arguments forwarded to the parent initializer
        """
        super().__init__(*args, **kwargs)
        # Add parameters for feed storage in spider local time
        if not hasattr(self, "timezone"):
            self.timezone = "America/Chicago"
        # datetime.now(tz) converts the current instant into the spider's zone.
        # Localizing a naive datetime.now() would only relabel the host's
        # wall-clock time, which is wrong when the host runs in another zone.
        now = datetime.now(timezone(self.timezone))
        self.year = now.year
        self.month = now.strftime("%m")
        self.day = now.strftime("%d")
        self.hour_min = now.strftime("%H%M")

    def _clean_title(self, title: str) -> str:
        """Remove cancelled strings from title

        :param title: Raw meeting title
        :return: Title without "cancel..."/"rescheduled" words or dangling
                 separator characters
        """
        # Drop the keyword together with up to three separator characters
        # (whitespace, colon, dash) on either side of it
        clean_title = re.sub(
            r"([\s:-]{1,3})?(cancel\w+|rescheduled)([\s:-]{1,3})?",
            "",
            title,
            flags=re.IGNORECASE,
        ).strip()
        # Remove leading and trailing pipes, dashes, and colons
        return re.sub(r"(^[|\-:]\s+|\s*[|\-:]$)", "", clean_title).strip()

    def get_id(self, item: Mapping, identifier: Optional[str] = None) -> str:
        """Create an ID for a meeting based on its details like title and start
        time as well as any agency-provided unique identifiers.

        :param item: Meeting to generate an ID for
        :param identifier: Optional unique meeting identifier if available,
                           defaults to None
        :return: ID string based on meeting details
        """
        return self._get_id(item, identifier=identifier)

    def _get_id(self, item: Mapping, identifier: Optional[str] = None) -> str:
        """Create an ID based off of the meeting details, title and any identifiers

        The ID has the form ``<spider name>/<YYYYmmddHHMM>/<identifier>/<title>``.

        :param item: Meeting mapping providing ``"title"`` and ``"start"``
        :param identifier: Optional unique meeting identifier, "x" is used
                           when it is empty or None
        :return: Slash-delimited ID string
        """
        # Collapse every run of non-alphanumeric characters to a space first.
        # Inside a character class only the leading "^" negates, so the ranges
        # are listed without extra carets (which would keep a literal "^").
        alnum_title = re.sub(r"[^A-Za-z0-9]+", " ", self._clean_title(item["title"]))
        underscore_title = re.sub(r"\s+", "_", alnum_title).lower()
        # "/" separates the ID segments, so it cannot appear inside one
        item_id = (identifier or "x").replace("/", "-")
        start_str = item["start"].strftime("%Y%m%d%H%M")
        return "/".join([self.name, start_str, item_id, underscore_title])

    def get_status(self, item: Mapping, text: str = "") -> str:
        """Determine the status of a meeting based off of its details as well as
        any additional text that may indicate whether it has been cancelled.

        :param item: Meeting to get the status for
        :param text: Any additional text not included in the meeting details
                     that may indicate whether it's been cancelled, defaults
                     to ""
        :return: Status constant
        """
        return self._get_status(item, text=text)

    def _get_status(self, item: Mapping, text: str = "") -> str:
        """
        Generates one of the allowed statuses from constants based on the title
        and time of the meeting

        :param item: Meeting mapping providing ``"start"`` and optionally
                     ``"title"`` and ``"description"``
        :param text: Additional text to search for cancellation keywords
        :return: ``CANCELLED`` if a cancellation keyword is found, ``PASSED``
                 if the start is before the current time, else ``TENTATIVE``
        """
        # A key that is present with a None value would break str.join, so
        # treat missing and empty values alike
        meeting_text = " ".join(
            part or ""
            for part in (item.get("title"), item.get("description"), text)
        ).lower()
        if any(word in meeting_text for word in ("cancel", "rescheduled", "postpone")):
            return CANCELLED
        start = item["start"]
        # Match the awareness of the start value: tzinfo is None for naive
        # datetimes, which makes this identical to datetime.now(), while aware
        # datetimes get an aware "now" and no longer raise TypeError
        if start < datetime.now(getattr(start, "tzinfo", None)):
            return PASSED
        return TENTATIVE