Data Visualization - Geo

Data Visualization - Geo

import urllib.request
import json
import pandas as pd

import urllib.parse

# Fetch the most recent revisions (up to 500) of a Wikidata property page
# through the MediaWiki action API and decode the JSON answer.
propertyid = "Property:P856"

# Let urlencode percent-encode the values: the raw "|" separators and the ":"
# in the page title are not safe to paste into a URL verbatim.
url = "https://www.wikidata.org/w/api.php?" + urllib.parse.urlencode(
    {
        "action": "query",
        "prop": "revisions",
        "titles": propertyid,
        "rvprop": "timestamp|comment|ids",
        "rvlimit": 500,
        "format": "json",
    }
)

# The context manager closes the HTTP connection even if decoding fails.
with urllib.request.urlopen(url) as response:
    responsedata = json.loads(response.read().decode("utf-8"))

# Start with an empty accumulator list.
array = []

# For every page in the answer, show which fields its first revision carries.
pages = responsedata["query"]["pages"]
for page_id in pages:
    page_revisions = pages[page_id]["revisions"]
    if page_revisions:
        print(page_revisions[0].keys())

dict_keys(['revid', 'parentid', 'timestamp', 'comment'])

# Work with the first page of the answer and keep its revision list around.
pages = responsedata["query"]["pages"]
result = list(pages.values())[0]
revisions = result["revisions"]

# Display the field names of the first revision.
revision = revisions[0].keys()
revision

dict_keys(['revid', 'parentid', 'timestamp', 'comment'])

# One [timestamp, comment] pair per revision, in the order they were returned.
array = [[entry["timestamp"], entry["comment"]] for entry in revisions]

# Show the first pair as a sanity check.
array[:1]

[['2024-07-11T17:55:27Z',
  '/* wbsetclaim-update:2||1|1 */ [[Property:P2302]]: [[Q21510851]]']]

# One row per revision: when it happened and the edit summary attached to it.
dataframe = pd.DataFrame(array, columns=["timestamp", "modification"])

# Parse the ISO-8601 timestamps once with the format they actually use, then
# truncate to midnight so that all revisions of the same day share one value.
# (A second to_datetime call with a day/month/year format is unnecessary: the
# column is already datetime64 after this parse.)
dataframe["timestamp"] = pd.to_datetime(
    dataframe["timestamp"],
    format="%Y-%m-%dT%H:%M:%SZ",
    errors="raise",
).dt.normalize()

# Store the edit summaries as strings.
dataframe = dataframe.astype(dtype={"modification": "<U1000"})
dataframe

	timestamp	modification
0	2024-07-11	/* wbsetclaim-update:2\|\|1\|1 */ [[Property:P230...
1	2024-07-03	/* wbsetclaim-update:2\|\|1\|1 */ [[Property:P230...
2	2024-07-03	/* wbsetclaim-update:2\|\|1\|7 */ [[Property:P230...
3	2024-06-19	/* wbsetaliases-add:5\|mzn */ وبسایت, وبگاه, سا...
4	2024-06-19	/* wbsetlabel-add:1\|mzn */ رسمی سایت
...	...	...
495	2020-03-10	/* wbsetlabel-add:1\|mi */ pae tukutuku
496	2020-03-08	/* wbsetclaim-update:2\|\|1\|2 */ [[Property:P230...
497	2020-03-08	/* wbsetclaim-update:2\|\|1\|2 */ [[Property:P230...
498	2020-03-08	/* undo:0\|\|1131502789\|Trade */ I suggest a bot...
499	2020-03-08	/* wbsetclaim-create:2\|\|1 */ [[Property:P2302]...

500 rows × 2 columns

# Number of revisions per calendar day, with the day as a regular column.
group = (
    dataframe
    .groupby("timestamp")
    .count()
    .reset_index()
)
group

	timestamp	modification
0	2020-03-08	4
1	2020-03-10	1
2	2020-03-14	1
3	2020-03-16	7
4	2020-03-17	5
...	...	...
234	2024-05-28	1
235	2024-06-04	2
236	2024-06-19	2
237	2024-07-03	2
238	2024-07-11	1

239 rows × 2 columns

group.describe()

	timestamp	modification
count	239	239.000000
mean	2022-05-10 12:51:12.803347200	2.092050
min	2020-03-08 00:00:00	1.000000
25%	2021-04-08 12:00:00	1.000000
50%	2022-07-06 00:00:00	1.000000
75%	2023-05-19 12:00:00	2.000000
max	2024-07-11 00:00:00	64.000000
std	NaN	4.456124

group.dtypes

timestamp       datetime64[ns]
modification             int64
dtype: object

group.modification.max()

np.int64(64)

group.modification.min()

np.int64(1)

import calplot

# Daily modification counts as a date-indexed series, the input calplot expects.
days = group["timestamp"]
modification_counts = group["modification"].tolist()
events = pd.Series(modification_counts, index=days)

# Calendar heatmap, one row of axes per year.
heatmap_options = {
    "cmap": "YlGn",
    "yearlabel_kws": {"color": "black"},
    "linewidth": 0,
    "suptitle": "Number of modifications per day (last 500)",
}
calplot.calplot(events, **heatmap_options)

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

(<Figure size 1250x850 with 6 Axes>,
 array([<Axes: ylabel='2020'>, <Axes: ylabel='2021'>,
        <Axes: ylabel='2022'>, <Axes: ylabel='2023'>,
        <Axes: ylabel='2024'>], dtype=object))

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

findfont: Font family 'Helvetica' not found.

../_images/73dfca26999875279058b713dc227253476a45c4b5b9ea656341c774fca90f0e.png

import plotly.express as px

# Histogram of modifications over time: dates on x, summed daily counts on y.
fig = px.histogram(
    group,
    x="timestamp",
    y="modification",
    nbins=20,
    # Plotly Express looks labels up by column name, not by axis letter;
    # keys "x"/"y" would be silently ignored.
    labels={"timestamp": "month", "modification": "count"},
)
fig.update_layout(bargap=0.2)
fig.show()

import plotly.express as px

# Scatter plot of the number of modifications recorded on each day.
fig = px.scatter(
    group,
    x="timestamp",
    y="modification",
    # Plotly Express looks labels up by column name, not by axis letter;
    # keys "x"/"y" would be silently ignored.
    labels={"timestamp": "month", "modification": "count"},
)
# No bargap setting here: it only affects bar-like traces, not scatter points.
fig.show()

import plotly.express as px

# Distribution of the per-day modification counts, normalised to a density.
fig = px.histogram(
    group,
    x="modification",
    histnorm="probability density",
    nbins=20,
).update_layout(bargap=0.2)
fig.show()

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint that the query below is sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# Selects at most 100 items together with their English label and their
# coordinates (returned as WKT text, e.g. "Point(lon lat)").
# NOTE(review): the P1435/Q9259 and P17/Q142 constraints presumably restrict
# the result to UNESCO World Heritage Sites located in France — confirm the
# identifiers on Wikidata before relying on this.
query = """SELECT ?heritagesite ?label ?coordinates {
  ?heritagesite wdt:P1435 wd:Q9259;
        wdt:P17 wd:Q142;
        rdfs:label ?label;
        wdt:P625 ?coordinates.
        FILTER (lang(?label)="en")
}
LIMIT 100
"""


def get_results(endpoint_url, query):
    """Send a SPARQL query to an endpoint and return the converted answer.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        query: SPARQL query text.

    Returns:
        Whatever ``SPARQLWrapper`` produces when converting the response with
        the JSON return format.
    """
    # Identify the client with the running interpreter's major.minor version.
    major, minor = sys.version_info[0], sys.version_info[1]
    client = SPARQLWrapper(
        endpoint_url, agent=f"WDQS-example Python/{major}.{minor}"
    )
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


results = get_results(endpoint_url, query)

# Flatten every result binding into [entity URL, label, WKT coordinates].
array = [
    [binding[field]["value"] for field in ("heritagesite", "label", "coordinates")]
    for binding in results["results"]["bindings"]
]

from shapely import wkt
import geopandas as gp

# Tabular view of the query result.
dataframe = pd.DataFrame(array, columns=["ID", "label", "coordinates"])

# Turn each WKT string into a shapely geometry object.
dataframe["coordinates"] = [wkt.loads(text) for text in dataframe["coordinates"]]

# Promote the frame to a GeoDataFrame whose geometry is the parsed column.
geodataframe = gp.GeoDataFrame(dataframe, geometry="coordinates")

array[:1]

[['http://www.wikidata.org/entity/Q8399',
  'Vieux Lyon',
  'Point(4.828056 45.763056)']]

geodataframe.dtypes

ID               object
label            object
coordinates    geometry
dtype: object

import folium

import html

# Interactive map with one green marker per site; each popup links to the
# site's Wikidata page.
# NOTE: `map` shadows the builtin of the same name; the name is kept so that
# any other cell referring to it keeps working.
map = folium.Map(location=[45.1093, 2.6753], zoom_start=6)

# folium expects [latitude, longitude]; the geometries store x=longitude and
# y=latitude, hence the swapped order.
geo_df_list = [[point.xy[1][0], point.xy[0][0]] for point in geodataframe.geometry]

# Iterate positionally over coordinates, IDs and labels together instead of
# keeping a manual counter and doing label-based lookups.
for coordinates, entity_url, label in zip(
    geo_df_list, geodataframe.ID, geodataframe.label
):
    # Escape the values (they come from an external service) and close the
    # anchor tag so the popup is well-formed HTML.
    popup_html = (
        "Wikidata: <a target='_blank' href='"
        + html.escape(str(entity_url), quote=True)
        + "'>"
        + html.escape(str(label))
        + "</a>"
    )
    map.add_child(
        folium.Marker(
            location=coordinates,
            icon=folium.Icon(color="green"),
            popup=popup_html,
        )
    )

map

Make this Notebook Trusted to load map: File -> Trust Notebook