Skip to content

Commit

Permalink
Merge pull request #86 from scicloj/linear-regression-wip
Browse files Browse the repository at this point in the history
Linear regression intro wip
  • Loading branch information
daslu authored Dec 11, 2024
2 parents 3273883 + 59aa280 commit 8fd93fa
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 19 deletions.
24 changes: 12 additions & 12 deletions notebooks/index.clj
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,15 @@ directly for tabular data structures or provide high interoperability with it.
chapter))

(let [;; One markdown bullet linking to a chapter page; the chapter and its
      ;; title are also printed as a side effect.
      chapter-line (fn [chapter]
                     (prn [chapter (chapter->title chapter)])
                     (format "\n - [%s](noj_book.%s.html)\n"
                             (chapter->title chapter)
                             chapter))
      ;; A part heading bullet followed by the bullets of its chapters.
      part-lines (fn [{:keys [part chapters]}]
                   (cons (format "- %s" part)
                         (map chapter-line chapters)))
      parts (clojure.edn/read-string (slurp "notebooks/chapters.edn"))]
  ;; Join all lines into one markdown string and hand it to `md`.
  (md (str/join "\n" (mapcat part-lines parts))))
(let [;; One markdown bullet linking to a chapter page; the chapter and its
      ;; title are also printed as a side effect.
      chapter-line (fn [chapter]
                     (prn [chapter (chapter->title chapter)])
                     (format "\n - [%s](noj_book.%s.html)\n"
                             (chapter->title chapter)
                             chapter))
      ;; A part heading bullet followed by the bullets of its chapters.
      part-lines (fn [{:keys [part chapters]}]
                   (cons (format "- %s" part)
                         (map chapter-line chapters)))
      parts (clojure.edn/read-string (slurp "notebooks/chapters.edn"))]
  ;; Join all lines into one markdown string and hand it to `md`.
  (md (str/join "\n" (mapcat part-lines parts))))
101 changes: 94 additions & 7 deletions notebooks/noj_book/linear_regression_intro.clj
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,14 @@

(ns noj-book.linear-regression-intro
(:require
[tech.v3.dataset :as ds]
[tablecloth.api :as tc]
[tablecloth.column.api :as tcc]
[tech.v3.datatype.datetime :as datetime]))
[tech.v3.datatype.datetime :as datetime]
[tech.v3.dataset.modelling :as dsmod]
[scicloj.metamorph.ml :as ml]
[fastmath.ml.regression :as reg]
[scicloj.kindly.v4.kind :as kind]))

;; ## Reading and parsing data

Expand All @@ -31,20 +36,102 @@
{:key-fn column-name-mapping
:parser-fn {"Date" [:local-date-time "MM/dd/yyyy hh:mm:ss a"]}}))

counts

(def weather
  "Weather dataset read from the gzipped BicycleWeather CSV file,
  with column names converted by `keyword`."
  (let [path "data/seattle-bikes-and-weather/BicycleWeather.csv.gz"
        options {:key-fn keyword}]
    (tc/dataset path options)))

weather

;; ## Preprocessing

;; no good support for this in tablecloth
;; Our bike counts data are hourly, but the weather data is daily.
;; To join them, we will need to convert the bike hourly counts to daily counts.

;; In the Python book, this is done as follows in Pandas:
;; ```python
;; daily = counts.resample('d').sum()
;; ```

;; day column, group by, aggregate, sum.
;; Tablecloth's full support for time series is still under construction.
;; For now, we will have to be a bit more verbose:

(def daily-totals
  "Bike counts summed per calendar date: `counts` is grouped by the local
  date of each row's `:datetime`, and the `:total`, `:west` and `:east`
  columns are summed within each group."
  (let [->date-key (fn [row]
                     {:date (datetime/local-date-time->local-date
                             (:datetime row))})
        by-date (tc/group-by counts ->date-key)]
    (tc/aggregate-columns by-date
                          [:total :west :east]
                          tcc/sum)))


daily-totals

;; ## Prediction by weekday

;; Let us prepare the data for regression on the day of week.


(def days-of-week
  "Day-of-week keywords in order, Monday first. Kept as a vector so it
  can be looked up by zero-based index."
  (mapv keyword ["Mon" "Tue" "Wed" "Thu" "Fri" "Sat" "Sun"]))


;; We will convert numbers to days-of-week keywords:

(defn idx->day-of-week
  "Maps a one-based day number to its keyword in `days-of-week`:
  1 gives `:Mon`, 7 gives `:Sun`."
  [idx]
  ;; `days-of-week` is zero-based, hence the `dec`.
  (days-of-week (dec idx)))

;; E.g.,
(idx->day-of-week 1)
(idx->day-of-week 7)

;; Now, let us prepare the data:

(def totals-with-day-of-week
  "`daily-totals` reduced to two columns: `:total` and a `:day-of-week`
  keyword derived from each row's `:date`."
  (let [;; Numeric day-of-week field of every date, mapped to a keyword.
        ;; Presumably the field is 1..7 with Monday as 1 - TODO confirm.
        day-keywords (fn [ds]
                       (->> (:date ds)
                            (datetime/long-temporal-field :day-of-week)
                            (map idx->day-of-week)))]
    (-> daily-totals
        (tc/add-column :day-of-week day-keywords)
        (tc/select-columns [:total :day-of-week]))))

totals-with-day-of-week

(def totals-with-one-hot-days-of-week
  "`totals-with-day-of-week` with one indicator column per entry of
  `days-of-week` in place of the original `:day-of-week` column."
  (let [add-indicator (fn [dataset day]
                        (tc/add-column
                         dataset
                         day
                         (fn [ds]
                           ;; multiplying by 1 turns booleans into 0s and 1s
                           (tcc/* (tcc/eq (:day-of-week ds) day)
                                  1))))
        with-indicators (reduce add-indicator
                                totals-with-day-of-week
                                days-of-week)]
    (tc/drop-columns with-indicators [:day-of-week])))

totals-with-one-hot-days-of-week

;; Let us compute the linear regression model using Fastmath.
;; The binary columns are collinear with an intercept term (they sum to 1 in every row),
;; so we will fit the model without an intercept.
;; This way, the interpretation of each coefficient is the expected
;; bike count for the corresponding day of week.

(def fit
  "Linear model of `:total` on the day-of-week indicator columns,
  fitted with `reg/lm` and no intercept."
  (let [response (:total totals-with-one-hot-days-of-week)
        ;; every column except the response, as a sequence of rows
        predictors (tc/rows
                    (tc/drop-columns totals-with-one-hot-days-of-week
                                     [:total]))]
    (reg/lm response predictors {:intercept? false})))

;; Here are the regression results:

;; Capture what `println` writes for the model and show it as code.
(kind/code
 (with-out-str
   (println fit)))

(let [;; Group key: the local date of the row's :datetime.
      ->date-key (fn [row]
                   {:date (datetime/local-date-time->local-date
                           (:datetime row))})
      ;; Sum of the :total column within one group.
      sum-total (fn [group]
                  (tcc/sum (:total group)))]
  (tc/aggregate (tc/group-by counts ->date-key)
                {:total sum-total}))
;; We can see the difference between weekends and weekdays.

0 comments on commit 8fd93fa

Please sign in to comment.