Data Setup
import polars as pl
from scipy import spatial
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
# Data taken from https://www.openml.org/search?type=data&id=43093
# Lazily scan the CSV, expose the sale price as "price" and lower-cased
# coordinate columns, then keep only the three columns used downstream.
df = (
    pl.scan_csv("../data/miami-housing.csv")
    .with_columns(
        pl.col(["LATITUDE", "LONGITUDE"]).name.to_lowercase(),
        price=pl.col("SALE_PRC"),
    )
    .select("latitude", "longitude", "price")
)
TRAIN_TEST_SPLIT_FRACTION = 0.8
# Name of the column the models predict; read by the target-encoding step.
MODEL_TARGET = "price"

# `scan_csv` yields a LazyFrame, which supports neither `.sample()` nor
# `len()`; materialise it before shuffling and splitting.
if isinstance(df, pl.LazyFrame):
    df = df.collect()

df = (
    df
    # Shuffle the data to avoid any issues from the data being pre-ordered...
    .sample(fraction=1, shuffle=True)
    # ...then use row numbers as an index for separating train and test.
    .with_row_count(name="row_number")
    .with_columns([
        # Shuffling keeps the row count, so `len(df)` (evaluated on the
        # pre-shuffle frame) is still the correct total here.
        (pl.col("row_number") < TRAIN_TEST_SPLIT_FRACTION * len(df)).alias("is_train")
    ])
)
Feature Engineering
- Raw Latitude and Longitude
- Spatial Density
- Population density is correlated with many demographic processes, and this is certainly true for rental prices and incomes.
- Approach
- One could use many methods for measuring the spatial density around a home: counting the number of other home sales within some radius of each home sale
- or computing and sampling from a Kernel Density Estimate over home sale locations
- or even pulling third party census data about population density.
- Case (scipy's cKDTree 이용해서 Density Feature 생성)
def add_density_feature_columns_to_dataframe(
    geo_df: pl.DataFrame,
    radius: float = .005,
) -> pl.DataFrame:
    """Append a ``spatial_density`` column counting neighbouring rows.

    A KD-tree is built over the ``latitude``/``longitude`` columns of
    ``geo_df`` itself (not over any module-level frame), and every row is
    assigned the number of rows lying within ``radius`` of it.  Because each
    query point is also a member of the tree, the count includes the row
    itself.

    Args:
        geo_df: Frame containing ``latitude`` and ``longitude`` columns.
        radius: Search radius, expressed in the same units as the coordinate
            columns (presumably decimal degrees for raw lat/long -- confirm).

    Returns:
        A new frame equal to ``geo_df`` plus the ``spatial_density`` column.
    """
    coordinates = geo_df.select(["latitude", "longitude"]).to_numpy()
    tree = spatial.cKDTree(coordinates)
    # return_length=True yields neighbour counts instead of index lists.
    neighbour_counts = tree.query_ball_point(coordinates, radius, return_length=True)
    return geo_df.with_columns(pl.Series("spatial_density", neighbour_counts))
# Attach the spatial-density feature to the shuffled, train/test-flagged frame.
df_w_density = add_density_feature_columns_to_dataframe(df)
- Geohash Target Encoding(Ref)
- It’s a known fact — some neighborhoods are more expensive than others. So, it’s possible that giving information to the model about each home’s neighborhood (and the sale price that can be expected in that neighborhood) can add predictive power.
- A neighborhood can be anything — a zip-code, a street, or in our case, a Geohash.
def add_geohash_column_to_df(geo_df: pl.DataFrame, precision: int = 5) -> pl.DataFrame:
    """Append a ``geohash`` string column derived from each row's coordinates.

    The geohash is computed from the ``latitude``/``longitude`` columns of
    ``geo_df`` itself, so the function works on any frame passed in rather
    than only on the module-level ``df``.

    NOTE(review): relies on ``geohash2`` being importable at module level;
    no import for it is visible in this file -- confirm.

    Args:
        geo_df: Frame containing ``latitude`` and ``longitude`` columns.
        precision: Number of characters in each geohash, forwarded to
            ``geohash2.encode``.

    Returns:
        A new frame equal to ``geo_df`` plus the ``geohash`` column.
    """
    geohashes = (
        geo_df
        .select("latitude", "longitude")
        .map_rows(
            lambda row: geohash2.encode(row[0], row[1], precision=precision),
            return_dtype=pl.Utf8,
        )
        # The single output column of map_rows is named "map"; give it a
        # meaningful name before attaching it.
        .rename({"map": "geohash"})
    )
    return geo_df.with_columns(geohashes.get_column("geohash"))
def add_target_encoding_to_df(
    dataframe: pl.DataFrame,
    categorical_column: str = "geohash"
) -> pl.DataFrame:
    """Attach the per-category training-set mean of the model target.

    The mean of ``MODEL_TARGET`` is computed for each value of
    ``categorical_column`` using only rows where ``is_train`` is true, then
    left-joined back onto every row of ``dataframe``.  Categories that never
    occur in the training rows therefore receive a null in the new column.

    Args:
        dataframe: Frame holding ``is_train``, the target column and
            ``categorical_column``.
        categorical_column: Column whose values define the groups.

    Returns:
        ``dataframe`` with an added
        ``"<categorical_column>_<MODEL_TARGET>_mean"`` column.
    """
    encoded_column_name = f"{categorical_column}_{MODEL_TARGET}_mean"

    # Restrict to training rows so test-set targets cannot leak into the feature.
    training_rows = dataframe.filter(pl.col("is_train"))
    per_category_mean = training_rows.group_by(categorical_column).agg(
        pl.col(MODEL_TARGET).mean().alias(encoded_column_name)
    )

    return dataframe.join(per_category_mean, on=categorical_column, how="left")
# Geohash features are built from `df`, not `df_w_density`, so the resulting
# frames do not carry the `spatial_density` column.
df_w_geohash = add_geohash_column_to_df(df)
# Add the training-set mean of the target per geohash (default category column).
df_w_geohash_target_encoded = add_target_encoding_to_df(df_w_geohash)
Reference