import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
# Cleaned baptism, burial and marriage tables, one CSV each. The paths are
# relative, so they resolve against the current working directory.
bautismos_path = "../data/clean/bautismos_clean.csv"
bautismos = pd.read_csv(bautismos_path)

entierros_path = "../data/clean/entierros_clean.csv"
entierros = pd.read_csv(entierros_path)

matrimonios_path = "../data/clean/matrimonios_clean.csv"
matrimonios = pd.read_csv(matrimonios_path)

# Make sure the folder the figure is saved into exists.
os.makedirs("../figures", exist_ok=True)
def prepare_events(df, sacrament_label):
    """Reduce an event table to one (year, sacrament) row per dated event.

    ``event_date`` is parsed with ``pd.to_datetime(..., errors="coerce")``,
    so values that cannot be parsed become ``NaT`` and their rows are
    dropped. The input frame is not modified.

    Args:
        df: DataFrame containing an ``event_date`` column.
        sacrament_label: Value stored in the ``sacrament`` column of every
            returned row.

    Returns:
        DataFrame with exactly the columns ``year`` (calendar year of the
        event) and ``sacrament``, keeping the index labels of the rows that
        had a parseable date.
    """
    parsed_dates = pd.to_datetime(df["event_date"], errors="coerce")
    dated = df.assign(event_date=parsed_dates).dropna(subset=["event_date"])
    labelled = dated.assign(
        year=dated["event_date"].dt.year,
        sacrament=sacrament_label,
    )
    return labelled[["year", "sacrament"]]

# Reduce each table to (year, sacrament) rows, labelled in English for the
# plot legend.
bautismos_yearly = prepare_events(bautismos, "Baptisms")
entierros_yearly = prepare_events(entierros, "Burials")
matrimonios_yearly = prepare_events(matrimonios, "Marriages")

# Stack the three tables into one long frame with a fresh 0..n-1 index.
yearly_frames = [bautismos_yearly, entierros_yearly, matrimonios_yearly]
events_all = pd.concat(yearly_frames, ignore_index=True)

# Long format: one row per (year, sacrament) pair with its number of events.
events_by_year_and_type = events_all.groupby(["year", "sacrament"])
yearly_counts = events_by_year_and_type.size().reset_index(name="count")

# Wide format: years as rows, sacraments as columns. Pairs that never occur
# come out of the pivot as NaN and are set to 0; rows are ordered by year.
yearly_pivot = (
    yearly_counts
    .pivot(index="year", columns="sacrament", values="count")
    .fillna(0)
    .sort_index()
)


# Stacked bar chart: one bar per year, one coloured segment per sacrament.
years = yearly_pivot.index.values
cols = yearly_pivot.columns

fig, ax = plt.subplots()

# Height already drawn for each year; the next segment starts from here.
bottom = np.zeros(len(years))

for sacrament in cols:
    counts = yearly_pivot[sacrament].values
    ax.bar(years, counts, bottom=bottom, label=sacrament)
    # Raise the baseline so the next sacrament is drawn on top of this one.
    bottom = bottom + counts

ax.set_xlabel("Year")
ax.set_ylabel("Number of events")
ax.legend(title="Event Type")

plt.tight_layout()
plt.show()

# The figure is written through its own reference, not the pyplot state.
fig.savefig("../figures/distribution_events_per_year.png", dpi=300)

# Person and place tables.
# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk. Without it this read emits a DtypeWarning
# ("Columns (13,14,15) have mixed types") and those columns can end up
# holding a mix of Python types.
# NOTE(review): an explicit dtype= for those three columns would be stricter
# still, but their intended types are not visible here.
personas = pd.read_csv("../data/clean/personas.csv", low_memory=False)
places = pd.read_csv("../data/clean/places.csv")
# Output (stderr) recorded when the cell above was run:
# /var/folders/37/816y3tg93hn7pnnrd0c33c_00000gn/T/ipykernel_80252/254567552.py:1: DtypeWarning: Columns (13,14,15) have mixed types. Specify dtype option on import or set low_memory=False.
#   personas = pd.read_csv("../data/clean/personas.csv")
# Normalise place names (trim whitespace, lower-case) before using them as
# the join key.
# NOTE(review): personas['birth_place'] is used as-is; this assumes it is
# already trimmed and lower-cased -- confirm against the cleaning step.
places['place_name'] = places['place_name'].str.strip().str.lower()

# Attach the columns of `places` to every person by birth place, then keep
# only the rows that ended up with both coordinates.
# NOTE(review): if `places` lists the same place_name more than once, the
# left join repeats the person rows and inflates the counts -- verify.
birth_places = personas.merge(places, left_on='birth_place', right_on='place_name', how='left')
birth_places = birth_places.dropna(subset=['latitude', 'longitude'])
birth_places = birth_places[['birth_place', 'latitude', 'longitude']]

# Number of persons per (birth place, coordinates), most frequent first.
birth_places_count = (
    birth_places
    .groupby(['birth_place', 'latitude', 'longitude'])
    .size()
    .reset_index(name='count')
)
birth_places_count = birth_places_count.sort_values('count', ascending=False)

# Create the output folder first (as is done for ../figures) so to_csv does
# not fail on a fresh checkout.
os.makedirs("../data/stats", exist_ok=True)
birth_places_count.to_csv("../data/stats/birth_places_count.csv", index=False)

# Notebook display: summary statistics of the numeric columns.
birth_places_count.describe()
# Output:
#        latitude  longitude        count
# count 72.000000 72.000000 72.000000
# mean -14.108773 -74.204983 76.222222
# std 0.508643 0.560368 305.530304
# min -15.278870 -76.484250 1.000000
# 25% -14.286957 -74.274407 1.000000
# 50% -14.177605 -74.036060 2.000000
# 75% -13.922675 -73.944478 5.000000
# max -11.775270 -73.185060 1903.000000
# Notebook display of the birth-place count table.
birth_places_count
# Output:
#    birth_place latitude longitude count
# 52 pampamarca -14.23274 -74.04139 1903
# 4 aucará -14.28099 -73.97489 1505
# 15 chacralla -14.20690 -73.99316 845
# 38 ishua -14.24973 -73.95556 640
# 62 santa ana de aucará -14.19929 -74.08823 108
# ... ... ... ... ...
# 39 jauja -11.77527 -75.50007 1
# 40 julcamarca -13.01443 -74.44459 1
# 45 nasca -14.82763 -74.93700 1
# 46 ocaña -14.39862 -74.82269 1
# 71 yanaccollpa -14.54731 -74.26170 1
#
# 72 rows × 4 columns

# Total number of persons born in the four most frequent birth places.
# Taken from the table itself instead of typing in the four counts by hand,
# so the figure stays correct when the underlying data changes. nlargest
# does not depend on the current row order of the frame.
most_places = birth_places_count['count'].nlargest(4).sum()
most_places
# Output: 4893
# Share, in percent, of all counted births that fall in `most_places`.
total_births = birth_places_count['count'].sum()
perc = most_places * 100 / total_births
# Notebook display: the percentage rounded to a whole number.
perc.round(0)
# Output: np.float64(89.0)