Joshua Sundance Bailey committed on
Commit
9d62f11
1 Parent(s): d362ca8
geospatial-data-converter/kml_tricks.py CHANGED
@@ -1,4 +1,5 @@
1
  import zipfile
 
2
 
3
  import bs4
4
  import fiona
@@ -8,122 +9,90 @@ import pandas as pd
8
  fiona.drvsupport.supported_drivers["KML"] = "rw"
9
 
10
 
11
- def desctogdf(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
12
- """Parses Descriptions from Google Earth file to create a legit gpd.GeoDataFrame"""
13
- dfs = []
14
- len(gdf)
15
- # pull chunks of data from feature descriptions
16
- for idx, desc in enumerate(gdf["Description"], start=1):
17
- try:
18
- tmpdf = pd.read_html(desc)[1].T
19
- except IndexError:
20
- tmpdf = pd.read_html(desc)[0].T
21
- tmpdf.columns = tmpdf.iloc[0]
22
- tmpdf = tmpdf.iloc[1:]
23
- dfs.append(tmpdf)
24
- # join chunks together
25
- ccdf = pd.concat(dfs, ignore_index=True)
26
- ccdf["geometry"] = gdf["geometry"]
27
- df = gpd.GeoDataFrame(ccdf, crs=gdf.crs)
28
- return df
29
-
30
-
31
- def readkmz(path: str) -> gpd.GeoDataFrame:
32
- """Simply read kmz using geopandas/fiona without parsing Descriptions"""
33
- # get name of kml in kmz (should be doc.kml but we don't assume)
34
  with zipfile.ZipFile(path, "r") as kmz:
35
- namelist = [f for f in kmz.namelist() if f.endswith(".kml")]
36
- if len(namelist) != 1:
37
- # this should never really happen
38
  raise IndexError(
39
- "kmz contains more than one kml. Extract or convert to multiple kmls.",
40
  )
41
- # return GeoDataFrame by reading contents of kmz
42
- return gpd.read_file("zip://{}\\{}".format(path, namelist[0]), driver="KML")
43
 
44
 
45
- def ge_togdf(path: str) -> gpd.GeoDataFrame:
46
- """Return gpd.GeoDataFrame after reading kmz or kml and parsing Descriptions"""
47
  if path.endswith(".kml"):
48
- gdf = desctogdf(gpd.read_file(path, driver="KML"))
49
- elif path.endswith(".kmz"):
50
- gdf = desctogdf(readkmz(path))
51
- else:
52
- raise ValueError("File must end with .kml or .kmz")
53
- return gdf
54
-
55
-
56
- def simpledata_fromcode(kmlcode: str) -> pd.DataFrame:
57
- """Return DataFrame extracted from KML code
58
- parameter kmlcode (str): kml source code
59
- Uses simpledata tags, NOT embedded tables in feature descriptions
60
- """
61
- # get the KML source code as a BeautifulSoup object
62
- soup = bs4.BeautifulSoup(kmlcode, "html.parser")
63
- # find all rows (schemadata tags) in the soup
64
- rowtags = soup.find_all("schemadata")
65
- # generator expression yielding a {name: value} dict for each row
66
- rowdicts = (
67
  {field.get("name"): field.text for field in row.find_all("simpledata")}
68
- for row in rowtags
69
  )
70
- # return pd.DataFrame from row dict generator
71
- return pd.DataFrame(rowdicts)
72
-
73
-
74
- def kmlcode_fromfile(gefile: str) -> str:
75
- """Return kml source code (str) extracted from Google Earth File
76
- parameter gefile (str): absolute or relative path to Google Earth file
77
- (kmz or kml)
78
- Uses simpledata tags, NOT embedded tables in feature descriptions
79
- """
80
- fileextension = gefile.lower().split(".")[-1]
81
- if fileextension == "kml":
82
- with open(gefile, "r") as kml:
83
- kmlsrc = kml.read()
84
- elif fileextension == "kmz":
85
- with zipfile.ZipFile(gefile) as kmz:
86
- # there should only be one kml file and it should be named doc.kml
87
- # we won't make that assumption
88
- kmls = [f for f in kmz.namelist() if f.lower().endswith(".kml")]
89
- if len(kmls) != 1:
90
- raise IndexError(
91
- "kmz contains more than one kml. Extract or convert to multiple kmls.",
92
- )
93
- with kmz.open(kmls[0]) as kml:
94
- # .decode() because zipfile.ZipFile.open(name).read() -> bytes
95
- kmlsrc = kml.read().decode()
96
- else:
97
- raise ValueError("parameter gefile must end with .kml or .kmz")
98
- return kmlsrc
99
-
100
-
101
- def simpledata_fromfile(gefile: str) -> pd.DataFrame:
102
- """Return DataFrame extracted from Google Earth File
103
- parameter gefile (str): absolute or relative path to Google Earth file
104
- (kmz or kml)
105
- Uses simpledata tags, NOT embedded tables in feature descriptions
106
- """
107
- df = simpledata_fromcode(kmlcode_fromfile(gefile))
108
- if gefile.endswith(".kmz"):
109
- gefile_gdf = readkmz(gefile)
110
  else:
111
- gefile_gdf = gpd.read_file(gefile, driver="KML")
112
- gdf = gpd.GeoDataFrame(df, geometry=gefile_gdf["geometry"], crs=gefile_gdf.crs)
113
- return gdf
114
-
115
-
116
- def readge(gefile: str) -> pd.DataFrame:
117
- """Extract data from Google Earth file & save as zip
118
- parameter gefile (str): absolute or relative path to Google Earth file
119
- parameter zipfile (str): absolute or relative path to output zip file
120
- Will read simpledata tags OR embedded tables in feature descriptions
121
- """
122
- # retrieve DataFrame from gefile and use its to_file method
123
  try:
124
- # this function pulls data from tables embedded in feature descriptions
125
- df = ge_togdf(gefile)
126
  except (pd.errors.ParserError, ValueError):
127
- # this function pulls data from simpledata tags
128
- df = simpledata_fromfile(gefile)
129
- return df
 
1
  import zipfile
2
+ from typing import Any
3
 
4
  import bs4
5
  import fiona
 
9
  fiona.drvsupport.supported_drivers["KML"] = "rw"
10
 
11
 
12
def parse_description_to_gdf(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """Build a GeoDataFrame from the HTML tables embedded in feature descriptions.

    Every value of ``gdf["Description"]`` is parsed with ``pd.read_html``.
    When more than one table is found the second one is used, otherwise the
    first. The chosen table is transposed, its first row becomes the column
    header and the remaining rows are kept as data.

    Args:
        gdf: GeoDataFrame providing ``Description`` and ``geometry`` columns.

    Returns:
        A GeoDataFrame made of the concatenated parsed rows plus the
        ``geometry`` column of ``gdf``, using the CRS of ``gdf``.

    Raises:
        Any exception raised by ``pd.read_html`` or ``pd.concat`` propagates
        to the caller unchanged.
    """
    parsed_dataframes = []
    for desc in gdf["Description"]:
        # Parse each description exactly once and reuse the result for both
        # the table-count check and the table selection.
        tables = pd.read_html(desc)
        table = tables[1 if len(tables) > 1 else 0].T

        # The first transposed row holds the field names.
        table.columns = table.iloc[0]
        # Positional slice removes only the header row, even if index labels repeat.
        parsed_dataframes.append(table.iloc[1:])

    combined_df = pd.concat(parsed_dataframes, ignore_index=True)
    combined_df["geometry"] = gdf["geometry"]

    return gpd.GeoDataFrame(combined_df, crs=gdf.crs)
26
+
27
+
28
def read_kml_file(path: str) -> Any:
    """Read the single KML document stored inside a KMZ archive.

    Args:
        path: Path to a KMZ (zip) archive.

    Returns:
        The result of ``gpd.read_file`` on the archive's KML member, read
        with the ``KML`` driver.

    Raises:
        IndexError: If the archive does not contain exactly one member whose
            name ends with ``.kml`` (compared case-insensitively).
    """
    with zipfile.ZipFile(path, "r") as kmz:
        # Case-insensitive so members such as "DOC.KML" are recognised,
        # matching how extract_kml_from_file selects members.
        kml_files = [f for f in kmz.namelist() if f.lower().endswith(".kml")]

    if not kml_files:
        # Distinct message: the "more than one" text is wrong for this case.
        raise IndexError("KMZ contains no KML.")
    if len(kml_files) > 1:
        raise IndexError(
            "KMZ contains more than one KML. Extract or convert to multiple KMLs.",
        )

    # NOTE(review): the backslash separator between archive and member looks
    # Windows-specific -- confirm it resolves on other platforms.
    return gpd.read_file(f"zip://{path}\\{kml_files[0]}", driver="KML")
38
 
39
 
40
def parse_file_to_gdf(path: str) -> gpd.GeoDataFrame:
    """Read a KML or KMZ file and parse its feature descriptions.

    Args:
        path: Path to a ``.kml`` or ``.kmz`` file. The extension is compared
            case-insensitively.

    Returns:
        The GeoDataFrame produced by ``parse_description_to_gdf``.

    Raises:
        ValueError: If ``path`` ends with neither ``.kml`` nor ``.kmz``.
    """
    # Lower-case once so "FILE.KML" / "FILE.KMZ" are accepted, consistent
    # with the extension handling in extract_kml_from_file.
    lowered = path.lower()

    if lowered.endswith(".kml"):
        return parse_description_to_gdf(gpd.read_file(path, driver="KML"))

    if lowered.endswith(".kmz"):
        return parse_description_to_gdf(read_kml_file(path))

    raise ValueError("File must end with .kml or .kmz")
48
+
49
+
50
def extract_data_from_kml_code(kml_code: str) -> pd.DataFrame:
    """Collect ``simpledata`` fields from KML source into a DataFrame.

    The source is parsed with BeautifulSoup's ``html.parser``. Each
    ``schemadata`` tag yields one row; within it, every ``simpledata`` tag
    contributes one cell keyed by its ``name`` attribute and holding the
    tag's text.

    Args:
        kml_code: KML source code.

    Returns:
        A DataFrame with one row per ``schemadata`` tag.
    """
    parsed = bs4.BeautifulSoup(kml_code, "html.parser")

    records = []
    for schema_row in parsed.find_all("schemadata"):
        record = {}
        for cell in schema_row.find_all("simpledata"):
            record[cell.get("name")] = cell.text
        records.append(record)

    return pd.DataFrame(records)
60
+
61
+
62
def extract_kml_from_file(file_path: str) -> str:
    """Return the KML source text of a KML file or of the KML inside a KMZ.

    Args:
        file_path: Path to a ``.kml`` or ``.kmz`` file. The extension is
            compared case-insensitively.

    Returns:
        The KML document as a string, decoded as UTF-8 in both branches.

    Raises:
        IndexError: If a KMZ archive does not contain exactly one member
            whose name ends with ``.kml`` (compared case-insensitively).
        ValueError: If the extension is neither ``kml`` nor ``kmz``.
    """
    file_extension = file_path.lower().split(".")[-1]

    if file_extension == "kml":
        # Explicit UTF-8 so the result does not depend on the platform's
        # locale encoding and agrees with the KMZ branch below.
        with open(file_path, "r", encoding="utf-8") as kml:
            return kml.read()

    if file_extension == "kmz":
        with zipfile.ZipFile(file_path) as kmz:
            kml_files = [f for f in kmz.namelist() if f.lower().endswith(".kml")]
            if not kml_files:
                # Distinct message: "more than one" is wrong for this case.
                raise IndexError("KMZ contains no KML.")
            if len(kml_files) > 1:
                raise IndexError(
                    "KMZ contains more than one KML. Extract or convert to multiple KMLs.",
                )
            with kmz.open(kml_files[0]) as kml:
                # ZipFile.open(...).read() yields bytes, hence the decode.
                return kml.read().decode("utf-8")

    raise ValueError("File path must end with .kml or .kmz")
81
+
82
+
83
def extract_data_from_file(file_path: str) -> pd.DataFrame:
    """Build a GeoDataFrame from the ``simpledata`` fields of a KML/KMZ file.

    Attribute rows come from ``extract_data_from_kml_code``; the geometry
    column and CRS come from reading the same file with geopandas.

    Args:
        file_path: Path to a ``.kml`` or ``.kmz`` file.

    Returns:
        A ``gpd.GeoDataFrame`` combining the extracted attribute rows with
        the ``geometry`` column and CRS of the file as read by geopandas.
    """
    df = extract_data_from_kml_code(extract_kml_from_file(file_path))

    # Case-insensitive check: extract_kml_from_file accepts "X.KMZ", so the
    # archive reader must be chosen for it here as well; otherwise the KMZ
    # would be handed to the plain-KML reader.
    if file_path.lower().endswith(".kmz"):
        file_gdf = read_kml_file(file_path)
    else:
        file_gdf = gpd.read_file(file_path, driver="KML")

    return gpd.GeoDataFrame(df, geometry=file_gdf["geometry"], crs=file_gdf.crs)
92
+
93
+
94
def read_ge_file(file_path: str) -> pd.DataFrame:
    """Load a Google Earth file, preferring tables embedded in descriptions.

    ``parse_file_to_gdf`` is tried first. If it raises
    ``pd.errors.ParserError`` or ``ValueError``, the result of
    ``extract_data_from_file`` is returned instead.

    Args:
        file_path: Path to the file to read.

    Returns:
        The frame produced by whichever of the two readers succeeded.
    """
    try:
        result = parse_file_to_gdf(file_path)
    except (pd.errors.ParserError, ValueError):
        # Description parsing failed; fall back to the simpledata reader.
        result = extract_data_from_file(file_path)
    return result