::p_load(dplyr, stringr, kableExtra)
pacman= function (f) read.csv(file.path(here::here("data"), f))
load_data
= load_data("noahs-orders.csv")
orders = load_data("noahs-orders_items.csv")
orders_items = load_data("noahs-products.csv")
products = load_data("noahs-customers.csv") customers
Solving the Hanukkah of Data Puzzles
R
Python
Coding Puzzles
Code solutions for the Hanukkah of Data challenge.
The Hanukkah of Data provides eight short daily coding puzzles during Hanukkah. I found this set of puzzles to be a great for improving skills in data wrangling, I initially wrote my solutions in R but these puzzles served as a good practice set for getting some reps in with data cleaning in Python.
The puzzle solutions can be a little repetitive, however the solutions are not immediately apparent from the prompts, and I found myself looking forward to solving the riddles each morning. With each puzzle you unlock more of the artwork in Noah’s Rug.
Spoiler warning – Below are my solutions to the puzzles.
Lets read in the data, load our packages, and do some pre-processing.
import pandas as pd
= pd.read_csv("data/noahs-orders.csv")
orders = pd.read_csv("data/noahs-orders_items.csv")
orders_items = pd.read_csv("data/noahs-products.csv")
products = pd.read_csv("data/noahs-customers.csv")
customers
= (
data
customers= "customerid")
.merge(orders, on = "orderid")
.merge(orders_items, on = "sku")
.merge(products, on )
|>
customers mutate(name = gsub(" jr.| iii| v| ii| iv | i", "", tolower(name)),
last_name = str_extract(name, "\\w+$"),
name_number = str_replace_all(last_name, "a|b|c", "2") |>
str_replace_all("d|e|f", "3") |>
str_replace_all("g|h|i", "4") |>
str_replace_all("j|k|l", "5") |>
str_replace_all("m|n|o", "6") |>
str_replace_all("p|q|r|s", "7") |>
str_replace_all("t|u|v", "8") |>
str_replace_all("w|x|y|z", "9"),
phone_number = gsub("-", "", phone)) |>
filter(name_number == phone_number) |>
select(name, phone)
name phone
1 sam tannenbaum 826-636-2286
= {
letter_to_phone "a": "2", "b": "2", "c": "2",
"d": "3", "e": "3", "f": "3",
"g": "4", "h": "4", "i": "4",
"j": "5", "k": "5", "l": "5",
"m": "6", "n": "6", "o": "6",
"p": "7", "q": "7", "r": "7", "s": "7",
"t": "8", "u": "8", "v": "8",
"w": "9", "x": "9", "y": "9", "z": "9"
}
for index, row in customers.iterrows():
= row['name']
name = row['phone']
phone
= (
lname
name
.lower()"jr.| iii| v| ii| iv | i", "")
.replace(" ")[-1]
.split(
)
for word, replacement in letter_to_phone.items():
= lname.replace(word, replacement)
lname
if str(lname) == str(phone).replace("-", ""):
print(f"Solution:\nName: {name}\nPhone: {phone}")
Solution:
Name: Sam Tannenbaum
Phone: 826-636-2286
= products |>
bagel_skus filter(grepl("bagel", tolower(desc))) |>
pull(sku)
= orders_items |>
bagel_order_ids filter(sku %in% bagel_skus)
= orders |>
bagel_order_customers mutate(year = lubridate::year(shipped)) |>
filter(orderid %in% bagel_order_ids$orderid,
== 2017) |>
year pull(customerid)
|>
customers filter(customerid %in% bagel_order_customers) |>
mutate(name = gsub(" jr.| iii| v| ii| iv | i", "", tolower(name)),
last_name = str_extract(name, "\\w+$"),
first_name = str_extract(name, "^\\w+"),
initals = paste0(str_sub(first_name, 1, 1),
str_sub(last_name, 1, 1))) |>
filter(initals == "jp") |>
select(name, phone)
name phone
1 joshua peterson 332-274-4185
= data[data["desc"].str.contains("bagel", case = False) &
cust "shipped"].str.contains("2017")]
data[
for index, row in cust.iterrows():
= row['name']
name = row['phone']
phone
= (
names
name
.lower()"jr.| iii| v| ii| iv | i", "")
.replace(" ")
.split(
)= names[0][0] + names[-1][0]
initals
if initals == "jp":
print(f"Solution:\nName: {name}\nPhone: {phone}")
Solution:
Name: Joshua Peterson
Phone: 332-274-4185
= c(1939, 1951, 1963, 1975, 1987, 1999)
rabbit_years
= customers |>
neighborhood filter(phone == "332-274-4185") |>
pull("citystatezip")
|>
customers mutate(year = lubridate::year(birthdate),
month = lubridate::month(birthdate),
day = lubridate::day(birthdate)) |>
filter(year %in% rabbit_years,
== 6 & day >= 21 | month == 7 & day <= 22,
month == neighborhood) |>
citystatezip select(name, phone)
name phone
1 Robert Morton 917-288-9635
= [1939, 1951, 1963, 1975, 1987, 1999]
rabbit_years = customers[customers["phone"] == "332-274-4185"]["citystatezip"].iloc[0]
neighborhood
"birthdate"] = pd.to_datetime(data["birthdate"])
data[
= data["birthdate"].dt.month
birth_month = data["birthdate"].dt.day
birth_day = data["birthdate"].dt.year
birth_year
= (birth_month == 6) & (birth_day >= 21)
june_f = (birth_month == 7) & (birth_day <= 22)
july_f = birth_year.isin(rabbit_years)
rabbit_year_f = data["citystatezip"] == neighborhood
neighborhood_f
= (june_f | july_f) & rabbit_year_f & neighborhood_f
conditions
# Apply the filter
"name", "phone"]] data[conditions][[
name phone
198918 Robert Morton 917-288-9635
= products |>
bakery_skus filter(grepl("bky", tolower(sku))) |>
filter(!grepl("bagel", tolower(desc)))
= orders_items |>
bakery_order_items filter(sku %in% bakery_skus$sku,
> 1) |>
qty pull("orderid")
= orders |>
bakery_orders_customers filter(orderid %in% bakery_order_items,
::hour(shipped) < 5,
lubridate== shipped)
ordered
|>
customers filter(customerid %in% bakery_orders_customers$customerid) |>
select(name, phone)
name phone
1 Renee Harmon 607-231-3605
"shipped"] = pd.to_datetime(data["shipped"])
data["ordered"] = pd.to_datetime(data["ordered"])
data[
= data["sku"].str.contains("bky", case=False)
sku_f = data["shipped"].dt.hour < 5
hour_f = data["shipped"] == data["ordered"]
in_store_f = data["qty"] > 1
qty_f
= sku_f & hour_f & in_store_f & qty_f
conditions
"name", "phone"]].drop_duplicates() data[conditions][[
name phone
18500 Renee Harmon 607-231-3605
= products |>
old_cat_food filter(grepl("senior cat", tolower(desc))) |>
pull("sku")
= orders_items |>
cat_order_items filter(sku %in% old_cat_food,
== 10) |>
qty pull("orderid")
|>
orders filter(orderid %in% cat_order_items) |>
left_join(customers, by = "customerid") |>
select(name, phone) |>
unique()
name phone
1 Nicole Wilson 631-507-6048
= data["desc"].str.contains("senior cat", case=False)
cat_desc = data["qty"] == 10
cat_qty
& cat_qty][["name", "phone"]].drop_duplicates() data[cat_desc
name phone
6140 Nicole Wilson 631-507-6048
# check which prices are offered below wholesale cost
= orders_items |>
orders_below_cost left_join(products |> select(wholesale_cost, sku),
by = "sku") |>
mutate(sale = unit_price < wholesale_cost) |>
filter(sale) |>
pull(orderid)
|>
orders filter(orderid %in% orders_below_cost) |>
group_by(customerid) |>
summarise(num_purchases = n()) |>
arrange(desc(num_purchases)) |>
left_join(customers |> select(customerid, name, phone), by = "customerid") |>
filter(num_purchases == max(num_purchases)) |>
select(name, phone)
# A tibble: 1 × 2
name phone
<chr> <chr>
1 Sherri Long 585-838-9161
= data["unit_price"] < data["wholesale_cost"]
below_cost
= (
counts
data[below_cost]"name", "phone"])
.groupby([= pd.NamedAgg(column="orderid", aggfunc="nunique"))
.agg(n_below_cost
)
"n_below_cost"] == counts["n_below_cost"].max()] counts[counts[
n_below_cost
name phone
Sherri Long 585-838-9161 31
= "585-838-9161"
day_6_phone
= customers |>
past_customer filter(phone == day_6_phone) |>
pull(customerid)
= products |>
color_purchases left_join(orders_items |> select(orderid, sku), by = "sku") |>
left_join(orders |> select(orderid, customerid,
ordered, shipped), by = "orderid") |>
filter(grepl("col", tolower(sku)),
== shipped) |>
ordered mutate(item = str_extract(desc, "\\w+\\s\\w+"),
colour = str_extract(desc, "\\(\\w+\\)"),
colour = gsub("\\(|\\)", "", colour))
= color_purchases |>
gf_purchases filter(customerid == past_customer,
grepl("\\(", desc))
= function(dat) {
search_for_purchases
= dat$item
gf_item = dat$colour
gf_colour = dat$ordered
gf_time
= color_purchases |>
out filter(item == gf_item & colour != gf_colour,
abs(difftime(ordered, gf_time, units = "mins")) < 10)
return(out)
}
# loop through girlfriend's purchases
lapply(1:nrow(gf_purchases), function(i) {
search_for_purchases(gf_purchases[i, ])
|>
}) bind_rows() |>
select(customerid) |>
left_join(customers, by = "customerid") |>
select(customerid, name, phone)
customerid name phone
1 5783 Carlos Myers 838-335-7157
= "585-838-9161"
day_6_phone
"color"] = data["desc"].str.extract(r"\((\w+)\)")
data["item"] = data["desc"].str.extract(r"(\w+\s\w+)")
data["ordered"] = pd.to_datetime(data["ordered"])
data[
= data[data["sku"].str.contains("col", case=False)]
color_data = color_data[color_data["phone"] == day_6_phone]
gf_items
for color, item, time in zip(gf_items["color"],
"item"],
gf_items["ordered"]):
gf_items[
= color_data["ordered"] - time
time_delta = color_data["color"] != color
diff_color = color_data["item"] == item
same_item = abs(time_delta) < pd.Timedelta("10 minutes")
same_time
= same_time & same_item & diff_color
candidate
if candidate.any():
"name", "phone"]].drop_duplicates() color_data[candidate][[
name phone
324740 Carlos Myers 838-335-7157
# find customer with the most collectible (col) purchases
|>
products filter(grepl("col", tolower(sku))) |>
left_join(orders_items |> select(orderid, sku), by = "sku") |>
left_join(orders |> select(orderid, customerid), by = "orderid") |>
group_by(customerid) |>
tally() |>
left_join(customers |> select(customerid, name, phone), by = "customerid") |>
arrange(desc(n)) |>
filter(n == max(n)) |>
select(name, phone)
# A tibble: 1 × 2
name phone
<chr> <chr>
1 James Smith 212-547-3518
= data[data["sku"].str.contains("col", case=False)]
collectables
(
collectables"name", "phone"])
.groupby([=pd.NamedAgg(column="orderid", aggfunc="nunique"))
.agg(n_collectables"n_collectables", ascending=False)
.sort_values(1)
.head( )
n_collectables
name phone
James Smith 212-547-3518 109