generated from databricks-industry-solutions/industry-solutions-blueprints
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path03-config.py
59 lines (48 loc) · 2.13 KB
/
03-config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Databricks notebook source
# MAGIC %md
# MAGIC You may find this series of notebooks at https://github.com/databricks-industry-solutions/ocr-phi-masking. For more information about this solution accelerator, visit https://www.databricks.com/solutions/accelerators/automated-phi-removal.
# COMMAND ----------
class SolAccUtil:
    """Create and inspect the storage paths used by this solution accelerator.

    On construction the helper derives three locations and creates them with
    ``dbutils.fs.mkdirs``:

    * ``base_path``  -- root directory (caller-supplied or ``/home/<user>/hls``)
    * ``data_path``  -- ``<base_path>/<project_name>/data``
    * ``delta_path`` -- ``<base_path>/<project_name>/delta``

    The class relies on the notebook globals ``dbutils``, ``display`` and
    ``displayHTML`` being available in the calling environment.
    """

    def __init__(self, project_name, base_path=None):
        """Resolve the project paths and make sure they exist.

        Args:
            project_name: Name of the project; surrounding whitespace is
                stripped and inner spaces are replaced with ``-``.
            base_path: Optional root directory. When ``None``, the root is
                ``/home/<user>/hls`` where ``<user>`` is read from the
                notebook context tags.
        """
        if base_path is not None:
            self.base_path = base_path
        else:
            # The current user is taken from the notebook context's 'user' tag.
            user = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply('user')
            self.base_path = f'/home/{user}/hls'

        self.project_name = project_name.strip().replace(' ', '-')
        self.data_path = f'{self.base_path}/{self.project_name}/data'
        self.delta_path = f'{self.base_path}/{self.project_name}/delta'

        dbutils.fs.mkdirs(self.base_path)
        dbutils.fs.mkdirs(self.data_path)
        dbutils.fs.mkdirs(self.delta_path)

    def load_remote_data(self, url, unpack=False):
        """Download ``url`` into ``data_path`` and optionally untar it.

        The file is written through the ``/dbfs`` local mount and keeps the
        last path segment of ``url`` as its name. Afterwards the contents of
        ``data_path`` are listed via ``display_data``.

        Args:
            url: Location of the file to download.
            unpack: When true, open the downloaded file with ``tarfile`` and
                extract it into ``data_path``.

        Raises:
            ValueError: If no file name can be derived from ``url``.
            requests.HTTPError: If the server answers with an error status.
        """
        import requests

        fname = url.split('/')[-1]
        if not fname:
            # A URL ending in '/' would otherwise make us open a directory.
            raise ValueError(f'cannot derive a file name from url: {url!r}')

        r = requests.get(url)
        # Fail loudly instead of saving an HTTP error page as if it were data.
        r.raise_for_status()

        local_dir = f'/dbfs{self.data_path}'
        local_file = f'{local_dir}/{fname}'

        # Context manager guarantees the file is flushed and closed before any
        # later read (e.g. the tar extraction below).
        with open(local_file, 'wb') as out:
            out.write(r.content)

        if unpack:
            import tarfile

            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with archives from a trusted source.
            with tarfile.open(local_file) as archive:
                archive.extractall(local_dir)

        # NOTE(review): assumed to run whether or not the archive was unpacked;
        # confirm against the original notebook's indentation.
        self.display_data()

    def print_paths(self):
        """Render the base, data and delta paths as HTML in the notebook."""
        html_str = f"""<p>
<b>base_path</b> = <i>{self.base_path}</i><br>
<b>data_path</b> [where your raw data will be stored]= <i>{self.data_path}</i> <br>
<b>delta_path</b> [where your delta tables will be stored] = <i>{self.delta_path}</i> <br>
</p>"""
        displayHTML(html_str)

    def display_data(self):
        """Print a banner and show the files currently under ``data_path``.

        When the directory is empty, a hint to call ``load_remote_data`` is
        printed instead of the listing.
        """
        print('*'*100)
        print(f'data available in {self.data_path} are:')
        print('*'*100)
        files = dbutils.fs.ls(f'{self.data_path}')
        if not files:
            print('no data available, please run load_remote_data(<url for the data>)')
        else:
            display(files)
# COMMAND ----------
# MAGIC %md
# MAGIC You now have access to the `SolAccUtil` class. Initializing this class creates the path where you store synthetically generated raw data and the root path for your Delta tables; it also lets you download data for testing.