import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Input CSVs, one per event type (labelled Baptisms, Burials and Marriages
# further down).
bautismos_path = "../data/clean/bautismos_clean.csv"
entierros_path = "../data/clean/entierros_clean.csv"
matrimonios_path = "../data/clean/matrimonios_clean.csv"

bautismos = pd.read_csv(bautismos_path)
entierros = pd.read_csv(entierros_path)
matrimonios = pd.read_csv(matrimonios_path)

# The chart produced later is saved under ../figures; make sure it exists.
os.makedirs("../figures", exist_ok=True)


def prepare_events(df, sacrament_label):
    """Reduce an event table to one ``(year, sacrament)`` row per dated event.

    Args:
        df: DataFrame containing an ``event_date`` column.
        sacrament_label: Value written to the ``sacrament`` column of every
            returned row.

    Returns:
        A new DataFrame with the columns ``year`` and ``sacrament`` that keeps
        the index labels of the retained rows. Rows whose ``event_date``
        cannot be parsed as a date are dropped. ``df`` itself is not modified.

    Raises:
        KeyError: If ``df`` has no ``event_date`` column.
    """
    # errors="coerce" turns unparseable values into NaT so they can be
    # dropped instead of aborting the whole conversion.
    dates = pd.to_datetime(df["event_date"], errors="coerce").dropna()
    return pd.DataFrame(
        {"year": dates.dt.year, "sacrament": sacrament_label},
        index=dates.index,
    )
# Tag every table with its event type, then stack them into one long frame.
bautismos_yearly = prepare_events(bautismos, "Baptisms")
entierros_yearly = prepare_events(entierros, "Burials")
matrimonios_yearly = prepare_events(matrimonios, "Marriages")

events_all = pd.concat(
    [bautismos_yearly, entierros_yearly, matrimonios_yearly],
    ignore_index=True,
)

# Long format: one row per (year, sacrament) pair with its number of events.
yearly_counts = (
    events_all
    .groupby(["year", "sacrament"])
    .size()
    .reset_index(name="count")
)

# Wide format: years as rows, one column per sacrament. Pairs that never
# occur are missing after the pivot, so they are filled with 0.
yearly_pivot = (
    yearly_counts
    .pivot(index="year", columns="sacrament", values="count")
    .fillna(0)
    .sort_index()
)
# Stacked bar chart: number of events per year, one coloured segment per
# sacrament.
years = yearly_pivot.index.values
cols = yearly_pivot.columns

fig, ax = plt.subplots()

# Running height of the stack; each sacrament is drawn on top of the ones
# already plotted.
bottom = np.zeros(len(years))
for sacrament in cols:
    counts = yearly_pivot[sacrament].values
    ax.bar(
        years,
        counts,
        bottom=bottom,
        label=sacrament,
    )
    bottom += counts

ax.set_xlabel("Year")
ax.set_ylabel("Number of events")
ax.legend(title="Event Type")
plt.tight_layout()

# Save before showing: matplotlib recommends calling savefig before show(),
# because a blocking show() closes the figure when it returns.
fig.savefig("../figures/distribution_events_per_year.png", dpi=300)
plt.show()
# low_memory=False lets pandas infer each column's dtype from the whole file.
# The default chunked read emitted:
#   DtypeWarning: Columns (13,14,15) have mixed types. Specify dtype option
#   on import or set low_memory=False.
personas = pd.read_csv("../data/clean/personas.csv", low_memory=False)
places = pd.read_csv("../data/clean/places.csv")

# Normalise the place names that serve as the join key.
# NOTE(review): personas['birth_place'] is joined as-is; presumably it is
# already stripped and lower-cased. Confirm against the cleaning step.
places['place_name'] = places['place_name'].str.strip().str.lower()

# Attach coordinates to every person and keep only the rows whose birth place
# was matched to a place with both latitude and longitude.
birth_places = personas.merge(places, left_on='birth_place', right_on='place_name', how='left')
birth_places = birth_places.dropna(subset=['latitude', 'longitude'])
birth_places = birth_places[['birth_place', 'latitude', 'longitude']]

# Number of persons per located birth place, most frequent first.
birth_places_count = (
    birth_places
    .groupby(['birth_place', 'latitude', 'longitude'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

# Create the output directory first, as is done for ../figures.
os.makedirs("../data/stats", exist_ok=True)
birth_places_count.to_csv("../data/stats/birth_places_count.csv", index=False)

birth_places_count.describe()
# Output:
# |  | latitude | longitude | count |
# |---|---|---|---|
# | count | 72.000000 | 72.000000 | 72.000000 |
# | mean | -14.108773 | -74.204983 | 76.222222 |
# | std | 0.508643 | 0.560368 | 305.530304 |
# | min | -15.278870 | -76.484250 | 1.000000 |
# | 25% | -14.286957 | -74.274407 | 1.000000 |
# | 50% | -14.177605 | -74.036060 | 2.000000 |
# | 75% | -13.922675 | -73.944478 | 5.000000 |
# | max | -11.775270 | -73.185060 | 1903.000000 |
birth_places_count
# Output:
# |  | birth_place | latitude | longitude | count |
# |---|---|---|---|---|
# | 52 | pampamarca | -14.23274 | -74.04139 | 1903 |
# | 4 | aucará | -14.28099 | -73.97489 | 1505 |
# | 15 | chacralla | -14.20690 | -73.99316 | 845 |
# | 38 | ishua | -14.24973 | -73.95556 | 640 |
# | 62 | santa ana de aucará | -14.19929 | -74.08823 | 108 |
# | ... | ... | ... | ... | ... |
# | 39 | jauja | -11.77527 | -75.50007 | 1 |
# | 40 | julcamarca | -13.01443 | -74.44459 | 1 |
# | 45 | nasca | -14.82763 | -74.93700 | 1 |
# | 46 | ocaña | -14.39862 | -74.82269 | 1 |
# | 71 | yanaccollpa | -14.54731 | -74.26170 | 1 |
# 72 rows × 4 columns
# Combined count of the most frequent birth places. Computed from the table
# rather than copied by hand from printed output, so it stays correct when
# the data changes (the previous literals 1903, 1505, 845 and 640 were the
# four largest counts).
top_n = 4
most_places = int(birth_places_count['count'].nlargest(top_n).sum())
most_places
# Output: 4893

# Share, in percent, of all located births that fall in those top places.
perc = most_places * 100 / birth_places_count['count'].sum()
perc.round(0)
# Output: np.float64(89.0)