# Webscraping Craigslist Apartment Listings

While looking for an apartment to rent in Toronto, I found myself checking Craigslist multiple times throughout the day for new postings. The recent rise in interest rates has increased demand for rental units in Toronto, making the rental market much more competitive and difficult to navigate. This post describes the Craigslist web crawler bot I created to identify relevant new apartment listings and have the results sent to my phone as push notifications.
## Overview

There are three main sections to this process:

1. Collect listings from Craigslist using `rvest`
2. Send push notifications to a phone or watch using `pushoverr`
3. Schedule the scraper to run at regular intervals with `cronR`
## 1. Collect listings from Craigslist
The Craigslist apartment listings URL has a logical structure:
"https://toronto.craigslist.org/search/toronto-on/apa?availabilityMode=0
\______/\_______/\____________________/\________/\___________________ ...
| | | | |
scheme location domain location arguments
We can alter the URL we call to filter listings that satisfy specific requirements. For example, we can use the arguments below to filter the results returned by Craigslist.
| Argument | Description |
|---|---|
| `&lat=`, `&lon=` | Centroid coordinates of the search |
| `&search_distance=` | Radius of the search from the centroid |
| `&min_price=` | Minimum listing price |
| `&max_price=` | Maximum listing price |
| `&min_bedrooms=` | Minimum number of bedrooms |
| `&sort=` | Sorting options; `date` sorts posts by latest |
Below is an example of how we can generate a URL specific to the filters we are after.

```r
# Setup URL
base = "https://toronto.craigslist.org/search/toronto-on/apa?availabilityMode=0&"
LAT = "43.66532376779693"  # at the Rotman School of Management
LNG = "-79.39860792142129"
MAX_PRICE = "3200"
MIN_BDRM = "2"
RADIUS = "1.2"  # 1.2 miles is ~1.9 km
SORT = "date"

url = sprintf("%slat=%s&lon=%s&max_price=%s&min_bedrooms=%s&search_distance=%s&sort=%s",
              base,
              LAT,
              LNG,
              MAX_PRICE,
              MIN_BDRM,
              RADIUS,
              SORT)
```

Alternatively, you can use the Craigslist website to dial in your search, then copy-paste the URL containing your search query.
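As a side note, if you'd rather build the query from named parameters than paste strings together, `httr::modify_url()` can assemble the same URL. This is just a sketch of that alternative; the `httr` package is not part of the original scripts:

```r
library(httr)

# build the same search URL from a named list of query parameters
url = modify_url(
  "https://toronto.craigslist.org/search/toronto-on/apa",
  query = list(
    availabilityMode = "0",
    lat = "43.66532376779693",   # centroid latitude
    lon = "-79.39860792142129",  # centroid longitude
    max_price = "3200",
    min_bedrooms = "2",
    search_distance = "1.2",     # radius in miles
    sort = "date"
  )
)
```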
With our URL, let's collect the listings. I suspect the `.result-heading`, `.result-row`, and `.result-date` selectors will line up when searching for apartment listings in other cities.
I am building a data frame containing the post title, link, and time the post was last updated.
```r
library(rvest)

page = read_html(url)

titles = page |>
  html_nodes(".result-heading") |>
  html_text(trim = TRUE)

links = page |>
  html_nodes(".result-row") |>
  html_element("a") |>
  html_attr("href")

timestamp = page |>
  html_nodes(".result-date") |>
  html_attr("datetime")

timestamp = as.POSIXct(timestamp, tz = "")

# organize into a data frame
posts = data.frame(
  titles,
  links,
  timestamp
)
```
Let’s take a quick look at the data we’ve extracted from the page.
```r
library(kableExtra)

posts |>
  head() |>
  kbl(caption = "Craigslist apartment listings") |>
  kable_paper("striped", full_width = F)
```
(Output: a table of the six most recent listings, with columns `titles`, `links`, and `timestamp`.)
## 2. Sending push notifications
While there are multiple tools available, I had success using the `pushoverr` library paired with the Pushover app. Using Pushover, I was able to make an application that sends push notifications to my iPhone and Apple Watch. I found the blog post by Brian Connelly helpful for setting up Pushover.
```r
# call in API keys for the Pushover app
source("secrets.R")

push = function(data) {
  msg = data$titles
  url = data$links

  pushoverr::pushover(message = msg,
                      user = USER_KEY,
                      app = APP_KEY,
                      url = url)
}
```
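It's worth testing the function once before wiring it into the scraper. A quick manual check, using a made-up one-row data frame (the listing below is fake):

```r
# manual test: push one fake listing (assumes valid keys in secrets.R)
push(data.frame(
  titles = "Test post: apartment bot is alive",
  links = "https://toronto.craigslist.org"
))
```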
Next, we need to add some logic to check the collected posts for new postings. I scheduled the script to run every 20 minutes, so I can get away with only pushing notifications for listings with a timestamp from the last 20 minutes.
```r
# for testing, set time to keep the most recent post
# (in production, use: time = as.POSIXct(Sys.time(), tz = ""))
time = max(timestamp)

THRESH = 20
notify = posts[difftime(time, timestamp, units = "mins") < THRESH, ]

# send push notifications
N = nrow(notify)
for (i in seq_len(N)) {  # seq_len() skips the loop cleanly when there are no new posts
  push(notify[i, ])
  # sleep 10s between batches of new postings
  if (i < N) { Sys.sleep(10) }
}
```
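One caveat: a pure time-window check can miss or double-send posts if the cron runs drift. A possible alternative is to cache the links that have already been pushed; a minimal sketch, where the `seen_links.rds` cache file is an assumption rather than part of the original script:

```r
# de-duplicate against a cache of links we have already pushed
cache = "seen_links.rds"
seen = if (file.exists(cache)) readRDS(cache) else character(0)

notify = posts[!(posts$links %in% seen), ]

# remember every link seen so far
saveRDS(unique(c(seen, posts$links)), cache)
```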
## 3. Scheduling the script

The magic happens when the script runs in the background while we are away from the computer. `cronR` is the perfect tool for the job here, as we can set this script to run at 20-minute intervals. A quick disclaimer: `cronR` runs only on Unix/Linux, so if you're using Windows you will have to find another task scheduler.
```r
library(cronR)

script = "bot.R" # the path to your script

cmd = cron_rscript(script,
                   log_append = TRUE,
                   log_timestamp = TRUE)

cron_add(command = cmd,
         frequency = '*/20 * * * *', # run every 20 minutes
         id = "CL-Toronto-Apt",
         description = "Webscraping Toronto Apartments off Craigslist",
         tags = "webscraping")
```
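`cronR` also provides `cron_ls()` and `cron_rm()`, which are handy for confirming the job was registered and for cleaning it up later:

```r
# list registered cron jobs
cron_ls()

# remove the job when it is no longer needed
cron_rm(id = "CL-Toronto-Apt")
```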
And voilà! We've now written a script to download the relevant apartment listings, filter for new postings, and send us push notifications for any new listings we may want to check out.

Here are the full, uninterrupted scripts. You will need to add a `secrets.R` file containing your Pushover app API keys. My project directory looks like:

```
├── bot.R
├── schedule.R
└── secrets.R
```
### bot.R
```r
library(rvest)

# prepare Craigslist URL ----
base = "https://toronto.craigslist.org/search/toronto-on/apa?availabilityMode=0&"
LAT = "43.66532376779693"  # at the Rotman School of Management
LNG = "-79.39860792142129"
MAX_PRICE = "3200"
MIN_BDRM = "2"
RADIUS = "1.2"  # 1.2 miles is ~1.9 km
SORT = "date"

url = sprintf("%slat=%s&lon=%s&max_price=%s&min_bedrooms=%s&search_distance=%s&sort=%s",
              base,
              LAT,
              LNG,
              MAX_PRICE,
              MIN_BDRM,
              RADIUS,
              SORT)

# collect listings ----
page = read_html(url)

titles = page |>
  html_nodes(".result-heading") |>
  html_text(trim = TRUE)

links = page |>
  html_nodes(".result-row") |>
  html_element("a") |>
  html_attr("href")

timestamp = page |>
  html_nodes(".result-date") |>
  html_attr("datetime")

timestamp = as.POSIXct(timestamp, tz = "")

# organize into a data frame
posts = data.frame(
  titles,
  links,
  timestamp
)
```
```r
# Send push notifications for new postings ----
source("secrets.R")

push = function(data) {
  msg = data$titles
  url = data$links

  pushoverr::pushover(message = msg,
                      user = USER_KEY,
                      app = APP_KEY,
                      url = url)
}

time = as.POSIXct(Sys.time(), tz = "")

THRESH = 20
notify = posts[difftime(time, timestamp, units = "mins") < THRESH, ]

N = nrow(notify)
for (i in seq_len(N)) {
  push(notify[i, ])
  # sleep 10s between batches of new postings
  if (i < N) { Sys.sleep(10) }
}
```
### schedule.R
```r
library(cronR)

script = "bot.R" # the path to your script

cmd = cron_rscript(script,
                   log_append = TRUE,
                   log_timestamp = TRUE)

cron_add(command = cmd,
         frequency = '*/20 * * * *', # run every 20 minutes
         id = "CL-Toronto-Apt",
         description = "Webscraping Toronto Apartments off Craigslist",
         tags = "webscraping")
```
### secrets.R
= "XXXXXXXXXXXXXXXXXX"
USER_KEY = "XXXXXXXXXXXXXXXXXX" APP_KEY