After reading this answer and getting a better understanding of the SCons hierarchy, I was wondering whether it is possible to tell SCons that a target has a dependency on some library files. My goal is that when a developer types scons -u in a subfolder, the libraries needed at link time are compiled as well.
See my script below. Written like this, a developer has to build each of the libraries manually before the Program can link.
Import('common_env')
#Grab a copy of the top environment (the one sent by the SConstruct file)
common_env = common_env.Clone()
#Because this component is only compiled in win32.
if (common_env['ENV']['CONFIG'] == "win32"):
    #Grabs the library name; the name should look like libpath_of_current_component-(debug/opt)
    libName = common_env.libName()
    progName = 'testWinRTP.exe'
    common_env.USE_BOOST()
    common_env.USE_ACE()
    common_env.USE_LOKI()
    common_env.USE_DIRECTSOUND()
    #Specific cppflag for this component (appended to the flags sent from the SConstruct)
    cppFlags = '-D_AFXDLL'
    #Specific linkFlag for this component (appended to the flags sent from the SConstruct)
    linkFlags = '/SUBSYSTEM:WINDOWS'
    #All the libraries the binary needs.
    libSuffix = common_env['ENV']['OPTSUFF'] + '.lib'
    common_env.Append(CPPFLAGS = cppFlags, LINKFLAGS = linkFlags)
    common_env.Append(LIBS=File('#/build/'+ common_env['ENV']['OPTSUFF'] +'/fwk/audio_fwk/src/tests/winRTP/testwinRTP/' + common_env['ENV']['CONFIG'] + '/libfwk_audio_fwk_src_tests_winRTP_testwinRTP-' + libSuffix))
    common_env.Append(LIBS=File('#/build/'+ common_env['ENV']['OPTSUFF'] +'/fwk/audio_fwk/src/tests/winRTP/CCNSMT/' + common_env['ENV']['CONFIG'] + '/libfwk_audio_fwk_src_tests_winRTP_CCNSMT-' + libSuffix))
    common_env.Append(LIBS=File('#/build/'+ common_env['ENV']['OPTSUFF'] +'/fwk/audio_fwk/src/filter_graph_mgt/' + common_env['ENV']['CONFIG'] + '/libfwk_audio_fwk_src_filter_graph_mgt-' + libSuffix))
    common_env.Append(LIBS=File('#/build/'+ common_env['ENV']['OPTSUFF'] +'/fwk/audio_fwk/src/filter_graph/' + common_env['ENV']['CONFIG'] + '/libfwk_audio_fwk_src_filter_graph-' + libSuffix))
    common_env.Append(LIBS=File('#/build/'+ common_env['ENV']['OPTSUFF'] +'/fwk/audio_fwk/src/filter_graph_drivers/'+ common_env['ENV']['CONFIG'] + '/libfwk_audio_fwk_src_filter_graph_drivers-' + libSuffix))
    common_env.Append(LIBS=File('#/build/'+ common_env['ENV']['OPTSUFF'] +'/fwk/audio_fwk/src/filter_graph_utils/' + common_env['ENV']['CONFIG'] + '/libfwk_audio_fwk_src_filter_graph_utils-' + libSuffix))
    common_env.Append(LIBS=File('#/build/'+ common_env['ENV']['OPTSUFF'] +'/fwk/voice_fwk/utils/src/voice_utils/' + common_env['ENV']['CONFIG'] + '/libfwk_voice_fwk_utils_src_voice_utils-' + libSuffix))
    common_env.Append(LIBS=File('#/build/'+ common_env['ENV']['OPTSUFF'] +'/fwk/voice_fwk/utils/src/config/' + common_env['ENV']['CONFIG'] + '/libfwk_voice_fwk_utils_src_config-' + libSuffix))
    common_env.Append(LIBS=File('#/build/'+ common_env['ENV']['OPTSUFF'] +'/fwk/voice_fwk/utils/src/log_utils/' + common_env['ENV']['CONFIG'] + '/libfwk_voice_fwk_utils_src_log_utils-' + libSuffix))
    common_env.Append(LIBS=File('#/build/'+ common_env['ENV']['OPTSUFF'] +'/fwk/voice_fwk/utils/src/vcs_utils/' + common_env['ENV']['CONFIG'] + '/libfwk_voice_fwk_utils_src_vcs_utils-' + libSuffix))
    common_env.Append(LIBS='msacm32.lib')
    #Sources of the library.
    sourcesLib = ['CFilterGraphTest.cpp', 'stdafx.cpp', 'testWinRTPDlg.cpp']
    #Creates the library
    common_env.Library(libName, sourcesLib)
    #Compiles a resource file needed for the binary
    compileRes = common_env.RES('testWinRTP.rc')
    #Creates the program; note that its sources include the .res generated by compiling the .rc file.
    sourcesBin = ['testWinRTP.cpp', 'testWinRTP.res']
    common_env.Program(progName, sourcesBin)
    #Install (copy) the binary in LINK/bin/winX
    common_env.installInLink(progName)
Thanks.
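For reference, one way to express that link-time dependency explicitly is SCons's Depends() call, so that scons -u from this folder also schedules the libraries for building. This is only a sketch against the script above, assuming the library targets are built by their own SConscript files elsewhere in the tree:

# Sketch only: collect the library File() nodes in a list so the same nodes
# can be used both for linking and as explicit dependencies of the program.
libNodes = [File('#/build/' + common_env['ENV']['OPTSUFF'] + '/fwk/audio_fwk/src/filter_graph/' + common_env['ENV']['CONFIG'] + '/libfwk_audio_fwk_src_filter_graph-' + libSuffix)]  # ...plus the other library nodes listed above
common_env.Append(LIBS=libNodes)
prog = common_env.Program(progName, sourcesBin)
# Tell SCons explicitly that the program depends on those library files,
# so building the program also triggers the library builds.
common_env.Depends(prog, libNodes)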
I wrote a Python script that moves files to another folder according to their extension: if it is a music file it goes to the Music folder, and so on. The problem is that I have to run the program from inside the directory for it to work properly, even though I coded it to act on the folder I type in. If I run it from outside the folder, it tells me that the file already exists in the destination folder, which is not true, so I have to place myself in the folder and run the script there for it to work as programmed. I'm kind of new here; I looked for a solution but couldn't find one, so any help would be appreciated. Here is the code:
import os
import shutil
docs = [
".doc",
".docx",
".odt",
".pdf",
".rtf",
".txt",
".wpd",
".xls",
".xlsx",
".xml",
".ppt",
".pptx",
".pps",
".ppsx",
".odp",
".csv",
".py",
".pyc",
".pyo",
".pyw",
".c",
".cc",
".cxx",
".h",
".hh",
".hpp",
".hxx",
".m",
".mm",
".pl",
".pm",
".pyc",
".pyo",
".rst",
".xhtml",
".yml",
".epub",
]
audio = [".mp3", ".wav", ".wma", ".aac", ".flac", ".ogg"]
images = [" .bmp", ".gif", ".jpg", ".jpeg", ".png", ".psd", ".tiff"]
video = [".avi", ".flv", ".mov", ".mp4", ".mpg", ".rm", ".swf", ".wmv"]
files = [
".7z",
".arj",
".bz2",
".cab",
".gz",
".rar",
".tar",
".tgz",
".zip",
".deb",
".iso",
".rpm",
".msi",
".exe",
".AppImage",
".flatpakref",
]
vpn = [".ovpn"]
fonts = [" .ttf", ".otf", ".woff", ".woff2"]
dir = input("Enter the directory: ")
path=os.listdir(dir)
def is_audio(file):
return os.path.splitext(file)[1] in audio
def is_image(file):
return os.path.splitext(file)[1] in images
def is_video(file):
return os.path.splitext(file)[1] in video
def is_doc(file):
return os.path.splitext(file)[1] in docs
def is_file(file):
return os.path.splitext(file)[1] in files
def is_vpn(file):
return os.path.splitext(file)[1] in vpn
def is_font(file):
return os.path.splitext(file)[1] in fonts
for file in path:
try:
if is_audio(file):
shutil.move(file, "/home/zephyr/Music")
print("Moved " + file + " to Music")
elif is_image(file):
shutil.move(file, "/home/zephyr/Pictures")
print("Moved " + file + " to Pictures")
elif is_video(file):
shutil.move(file, "/home/zephyr/Videos")
print("Moved " + file + " to Videos")
elif is_doc(file):
shutil.move(file, "/home/zephyr/Documents")
print("Moved " + file + " to Documents")
elif is_file(file):
shutil.move(file, "/home/zephyr/Programs")
print("Moved " + file + " to Programs")
except:
print(f"{file} Not Moved {FileExistsError}")
pass
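A likely explanation: the "already exists" message comes from the script's own except block, which prints FileExistsError no matter what actually went wrong, while the underlying issue is probably that os.listdir returns bare file names, which shutil.move then resolves against the current working directory. A minimal sketch of the usual fix, joining the entered directory onto each name before moving (shown for the audio branch only, with the same target path as in the script above):

import os
import shutil

# Sketch only: build the full source path so the script also works when it is
# launched from outside the chosen directory.
audio = [".mp3", ".wav", ".wma", ".aac", ".flac", ".ogg"]  # same list as above

directory = input("Enter the directory: ")
for name in os.listdir(directory):
    source = os.path.join(directory, name)  # full path instead of a bare name
    if os.path.splitext(name)[1] in audio:
        shutil.move(source, "/home/zephyr/Music")
        print("Moved " + name + " to Music")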
I am trying to write to my Azure Synapse Server from Databricks, but I keep getting the error:
Azure Synapse Analytics failed to execute the JDBC query produced by the connector
The code is as follows:
blobStorage = "*******.blob.core.windows.net"
blobContainer = "synapsestagecontainer"
blobAccessKey = "***************"
tempDir = "wasbs://" + blobContainer + "#" + blobStorage +"/tempDirs"
acntInfo = "fs.azure.account.key."+ blobStorage
sc._jsc.hadoopConfiguration().set(acntInfo, blobAccessKey)
dwDatabase = "carlspool"
dwServer = "carlssynapseworkspace"
dwUser = "techadmin#carlssynapseworkspace"
dwPass = "*******"
dwJdbcPort = "1433"
dwJdbcExtraOptions = "encrypt=true;trustServerCertificate=true;hostNameInCertificate=*.database.windows.net;loginTimeout=30;"
sqlDwUrl = "jdbc:sqlserver://" + dwServer + ".database.windows.net:" + dwJdbcPort + ";database=" + dwDatabase + ";user=" + dwUser+";password=" + dwPass + ";$dwJdbcExtraOptions"
sqlDwUrlSmall = "jdbc:sqlserver://" + dwServer + ".database.windows.net:" + dwJdbcPort + ";database=" + dwDatabase + ";user=" + dwUser+";password=" + dwPass
spark.conf.set(
"spark.sql.parquet.writeLegacyFormat",
"true")
example1.write.format("com.databricks.spark.sqldw").option("url", sqlDwUrlSmall).option("dbtable", "SampleTable12").option("forward_spark_azure_storage_credentials","True") .option("tempdir", tempDir).mode("overwrite").save()
The full stack trace is as follows:
Py4JJavaError Traceback (most recent call last)
<command-3898875195714724> in <module>
4 "true")
5
----> 6 example1.write.format("com.databricks.spark.sqldw").option("url", sqlDwUrlSmall).option("dbtable", "SampleTable12").option("forward_spark_azure_storage_credentials","True") .option("tempdir", tempDir).mode("overwrite").save()
/databricks/spark/python/pyspark/sql/readwriter.py in save(self, path, format, mode, partitionBy, **options)
1132 self.format(format)
1133 if path is None:
-> 1134 self._jwrite.save()
1135 else:
1136 self._jwrite.save(path)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
115 def deco(*a, **kw):
116 try:
--> 117 return f(*a, **kw)
118 except py4j.protocol.Py4JJavaError as e:
119 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o1761.save.
: com.databricks.spark.sqldw.SqlDWSideException: Azure Synapse Analytics failed to execute the JDBC query produced by the connector.
Underlying SQLException(s):
- com.microsoft.sqlserver.jdbc.SQLServerException: HdfsBridge::recordReaderFillBuffer - Unexpected error encountered filling record reader buffer: HadoopSqlException: String or binary data would be truncated. [ErrorCode = 107090] [SQLState = S0001]
at com.databricks.spark.sqldw.Utils$.wrapExceptions(Utils.scala:686)
at com.databricks.spark.sqldw.DefaultSource.createRelation(DefaultSource.scala:89)
at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:48)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:96)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:196)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:240)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:236)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:192)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:167)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:166)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:1079)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:126)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:267)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:104)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:852)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:217)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:1079)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:468)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:438)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:311)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.sql.SQLException: Exception thrown in awaitResult:
at com.databricks.spark.sqldw.JDBCWrapper.executeInterruptibly(SqlDWJDBCWrapper.scala:137)
at com.databricks.spark.sqldw.JDBCWrapper.$anonfun$executeInterruptibly$1(SqlDWJDBCWrapper.scala:115)
at com.databricks.spark.sqldw.JDBCWrapper.$anonfun$executeInterruptibly$1$adapted(SqlDWJDBCWrapper.scala:115)
at com.databricks.spark.sqldw.JDBCWrapper.withPreparedStatement(SqlDWJDBCWrapper.scala:362)
at com.databricks.spark.sqldw.JDBCWrapper.executeInterruptibly(SqlDWJDBCWrapper.scala:115)
at com.databricks.spark.sqldw.SqlDwWriter.$anonfun$saveToSqlDW$6(SqlDwWriter.scala:239)
at scala.runtime.java8.JFunction0$mcZ$sp.apply(JFunction0$mcZ$sp.java:23)
at com.databricks.backend.daemon.driver.ProgressReporter$.withStatusCode(ProgressReporter.scala:377)
at com.databricks.backend.daemon.driver.ProgressReporter$.withStatusCode(ProgressReporter.scala:363)
at com.databricks.spark.util.SparkDatabricksProgressReporter$.withStatusCode(ProgressReporter.scala:34)
at com.databricks.spark.sqldw.SqlDwWriter.$anonfun$saveToSqlDW$1(SqlDwWriter.scala:197)
at com.databricks.spark.sqldw.SqlDwWriter.$anonfun$saveToSqlDW$1$adapted(SqlDwWriter.scala:73)
at com.databricks.spark.sqldw.JDBCWrapper.withConnection(SqlDWJDBCWrapper.scala:340)
at com.databricks.spark.sqldw.SqlDwWriter.saveToSqlDW(SqlDwWriter.scala:73)
at com.databricks.spark.sqldw.DefaultSource.$anonfun$createRelation$3(DefaultSource.scala:122)
at com.databricks.spark.sqldw.Utils$.wrapExceptions(Utils.scala:655)
... 34 more
Caused by: com.microsoft.sqlserver.jdbc.SQLServerException: HdfsBridge::recordReaderFillBuffer - Unexpected error encountered filling record reader buffer: HadoopSqlException: String or binary data would be truncated.
at com.microsoft.sqlserver.jdbc.SQLServerException.makeFromDatabaseError(SQLServerException.java:262)
at com.microsoft.sqlserver.jdbc.SQLServerStatement.getNextResult(SQLServerStatement.java:1632)
at com.microsoft.sqlserver.jdbc.SQLServerPreparedStatement.doExecutePreparedStatement(SQLServerPreparedStatement.java:602)
at com.microsoft.sqlserver.jdbc.SQLServerPreparedStatement$PrepStmtExecCmd.doExecute(SQLServerPreparedStatement.java:524)
at com.microsoft.sqlserver.jdbc.TDSCommand.execute(IOBuffer.java:7418)
at com.microsoft.sqlserver.jdbc.SQLServerConnection.executeCommand(SQLServerConnection.java:3272)
at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeCommand(SQLServerStatement.java:247)
at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeStatement(SQLServerStatement.java:222)
at com.microsoft.sqlserver.jdbc.SQLServerPreparedStatement.execute(SQLServerPreparedStatement.java:505)
at com.databricks.spark.sqldw.JDBCWrapper.$anonfun$executeInterruptibly$2(SqlDWJDBCWrapper.scala:115)
at com.databricks.spark.sqldw.JDBCWrapper.$anonfun$executeInterruptibly$2$adapted(SqlDWJDBCWrapper.scala:115)
at com.databricks.spark.sqldw.JDBCWrapper.$anonfun$executeInterruptibly$3(SqlDWJDBCWrapper.scala:129)
at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
at scala.util.Success.$anonfun$map$1(Try.scala:255)
at scala.util.Success.map(Try.scala:213)
at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
I know other people have experienced this problem with Databricks, and I have tried to apply their answers to my situation, but I can't get it to work.
The full error is:
com.databricks.spark.sqldw.SqlDWSideException: Azure Synapse Analytics failed to execute the JDBC query produced by the connector.
I am running Databricks Runtime 8.3.
I think you need to ensure the schema exists. I was doing the same thing without the schema already created; once I created it manually, my code ran.
I struggled for a few days with the same error until I got to the code below. I also created a SECRET SCOPE, an EXTERNAL DATA SOURCE and an EXTERNAL FILE FORMAT in my Synapse dedicated pool.
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
df_gold = spark.read.format('delta').load('dbfs:/mnt/datalake/gold')
df = df_gold.select('faceId', 'name')
blobStorage = "<your storage name>.blob.core.windows.net"
blobContainer = "<your container name>"
blobAccessKey = "<your storage key>"
tempDir = "wasbs://" + blobContainer + "#" + blobStorage +"/tempDirs"
acntInfo = "fs.azure.account.key."+ blobStorage
sc._jsc.hadoopConfiguration().set(acntInfo, blobAccessKey)
dwDatabase = "<your pool name>"
dwServer = "<your workspace name>.database.windows.net"
dwUser = "user"
dwPass = "pass"
dwJdbcPort = "1433"
dwJdbcExtraOptions = "encrypt=true;trustServerCertificate=true;hostNameInCertificate=*.database.windows.net;loginTimeout=30;"
sqlDwUrl = "jdbc:sqlserver://" + dwServer + ":" + dwJdbcPort + ";database=" + dwDatabase + ";user=" + dwUser+";password=" + dwPass + ";$dwJdbcExtraOptions"
sqlDwUrlSmall = "jdbc:sqlserver://" + dwServer + ":" + dwJdbcPort + ";database=" + dwDatabase + ";user=" + dwUser+";password=" + dwPass
spark.conf.set(
"spark.sql.parquet.writeLegacyFormat",
"true")
(df
.write
.format("com.databricks.spark.sqldw")
.option("url", sqlDwUrlSmall)
.option("dbtable", "SampleTable")
.option( "forward_spark_azure_storage_credentials","True")
.option("tempdir", tempDir)
.mode("overwrite")
.save())
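As a side note on the specific SQLServerException in the trace ("String or binary data would be truncated"): that message usually means a string column in the target table is narrower than the incoming data. When the connector creates the table itself, its maxStrLength option controls the NVARCHAR width it uses (the default is 256). A hedged sketch, reusing example1, sqlDwUrlSmall and tempDir from the question and assuming the maxStrLength option is available for com.databricks.spark.sqldw on this runtime:

# Sketch only: widen the NVARCHAR columns the connector creates so longer
# strings are not truncated on load.
(example1.write
    .format("com.databricks.spark.sqldw")
    .option("url", sqlDwUrlSmall)
    .option("forward_spark_azure_storage_credentials", "true")
    .option("tempdir", tempDir)
    .option("maxStrLength", "4000")
    .option("dbtable", "SampleTable12")
    .mode("overwrite")
    .save())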
I'm new to web scraping and I'm trying to do it on this page: https://www.metrocuadrado.com/bogota.
The idea is to extract all the information. So far I have only been able to do it for one page, and I don't know how to handle the pagination. Is there any way to do it based on the code I already have?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
# opening up connection, grabbing html
my_url = 'https://www.metrocuadrado.com/bogota'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# html parser
page_soup = soup(page_html, "html.parser")
# grabs each product
containers = page_soup.findAll("div",{"class":"detail_wrap"})
filename = "metrocuadrado.csv"
f = open(filename, "w")
headers= "propertytype, businestype, cityname, neighborhood, description, price, area\n"
f.write(headers)
for container in containers:
    property_type = container["propertytype"]
    busines_type = container["businestype"]
    city_name = container["cityname"]
    neighborhood_location = container["neighborhood"]
    description = container.div.a.img["alt"]
    price_container = container.findAll("span", {"itemprop": "price"})
    price = price_container[0].text
    area_container = container.findAll("div", {"class": "m2"})
    area = area_container[0].p.span.text
    print("property_type: " + property_type)
    print("busines_type: " + busines_type)
    print("city_name: " + city_name)
    print("neighborhood_location: " + neighborhood_location)
    print("description: " + description)
    print("price: " + price)
    print("area: " + area)
    f.write(property_type + "," + busines_type + "," + city_name + "," + neighborhood_location + "," + description.replace(",", "|") + "," + price + "," + area + "\n")
f.close()
You are going to need to scrape each page, most likely in a loop. Do this by figuring out what the call is to get page 2, page 3, and so on. You can work that out by looking at the page source code, or by using your browser's developer tools and watching the network calls.
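A minimal sketch of that loop, reusing the imports from the question. The "?page=" query parameter here is purely an assumption for illustration; the real request for page 2 has to be confirmed in the browser's network tab and substituted in:

# Sketch only: replace the hypothetical "?page=" parameter with whatever
# request the site actually makes when you click page 2.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

base_url = 'https://www.metrocuadrado.com/bogota'
all_containers = []

for page in range(1, 6):  # first five pages, as an example
    page_url = base_url + '?page=' + str(page)
    uClient = uReq(page_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    # same per-page extraction as in the question
    all_containers.extend(page_soup.findAll("div", {"class": "detail_wrap"}))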
My goal is to scrape some specific data from multiple profile pages on Khan Academy and put the data in a CSV file.
Here is the code that scrapes one specific profile page and writes it to a CSV:
from bs4 import BeautifulSoup
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://www.khanacademy.org/profile/DFletcher1990/')
r.html.render(sleep=5)
soup=BeautifulSoup(r.html.html,'html.parser')
user_info_table=soup.find('table', class_='user-statistics-table')
dates,points,videos=[tr.find_all('td')[1].text for tr in user_info_table.find_all('tr')]
user_socio_table=soup.find_all('div', class_='discussion-stat')
data = {}
for gettext in user_socio_table:
    category = gettext.find('span')
    category_text = category.text.strip()
    number = category.previousSibling.strip()
    data[category_text] = number
filename = "khanscraptry1.csv"
f = open(filename, "w")
headers = "date, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx\n"
f.write(headers)
f.write(dates + "," + points.replace("," , "") + "," + videos + "," + data['questions'] + "," + data['votes'] + "," + data['answers'] + "," + data['flags raised'] + "," + data['project help requests'] + "," + data['project help replies'] + "," + data['comments'] + "," + data['tips and thanks'] + "\n")
f.close()
This code works fine with this specific link ('https://www.khanacademy.org/profile/DFletcher1990/').
However, when I change the link to another profile on Khan Academy, for example 'https://www.khanacademy.org/profile/Kkasparas/', I get this error:
KeyError: 'project help requests'
This is normal, because on the profile "https://www.khanacademy.org/profile/Kkasparas/" there is no 'project help requests' value (and no 'project help replies' either).
Thus data['project help requests'] and data['project help replies'] don't exist and can't be written to the CSV file.
My goal is to run this script with many profile pages.
So I would like to know how to put an NA in every case where the data for a variable is missing, and then write the NAs to the CSV file.
In other words: I would like to make my script work for any kind of user profile page.
Many thanks in advance for your contributions :)
You could define a list of all the possible keys and set the value of any key that is not present to 'NA' before writing to the file.
full_data_keys=['questions','votes','answers','flags raised','project help requests','project help replies','comments','tips and thanks']
for header_value in full_data_keys:
    if header_value not in data.keys():
        data[header_value] = 'NA'
Also, a gentle reminder to provide fully working code in your question: user_socio_table was not defined in it, and I had to look up your previous question to get that.
The full code would be:
from bs4 import BeautifulSoup
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://www.khanacademy.org/profile/Kkasparas/')
r.html.render(sleep=5)
soup=BeautifulSoup(r.html.html,'html.parser')
user_info_table=soup.find('table', class_='user-statistics-table')
dates,points,videos=[tr.find_all('td')[1].text for tr in user_info_table.find_all('tr')]
data = {}
user_socio_table=soup.find_all('div', class_='discussion-stat')
for gettext in user_socio_table:
    category = gettext.find('span')
    category_text = category.text.strip()
    number = category.previousSibling.strip()
    data[category_text] = number
full_data_keys = ['questions','votes','answers','flags raised','project help requests','project help replies','comments','tips and thanks']
for header_value in full_data_keys:
    if header_value not in data.keys():
        data[header_value] = 'NA'
filename = "khanscraptry1.csv"
f = open(filename, "w")
headers = "date, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx\n"
f.write(headers)
f.write(dates + "," + points.replace("," , "") + "," + videos + "," + data['questions'] + "," + data['votes'] + "," + data['answers'] + "," + data['flags raised'] + "," + data['project help requests'] + "," + data['project help replies'] + "," + data['comments'] + "," + data['tips and thanks'] + "\n")
f.close()
Output - khanscraptry1.csv
date, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx
6 years ago,1527829,1123,25,100,2,0,NA,NA,0,0
Change to the following lines if user_info_table is not present
if user_info_table is not None:
    dates,points,videos=[tr.find_all('td')[1].text for tr in user_info_table.find_all('tr')]
else:
    dates=points=videos='NA'
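As a follow-up on the same idea, dict.get with a default value gives an equivalent, slightly more compact way to fall back to 'NA' without pre-filling the dictionary. A small self-contained sketch (the sparse data dictionary is only an example):

full_data_keys = ['questions', 'votes', 'answers', 'flags raised',
                  'project help requests', 'project help replies',
                  'comments', 'tips and thanks']
data = {'questions': '25', 'votes': '100'}  # example of a sparse profile
# dict.get returns 'NA' for any key the profile page did not provide.
row = [data.get(key, 'NA') for key in full_data_keys]
print(",".join(row))  # -> 25,100,NA,NA,NA,NA,NA,NA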
Question:
names = ['Edogawa, Conan, 100', 'Kaitu, kid, 90', 'Mouri, Ran, 70']
I need to store this information in a dictionary like this:
{'Edogawa Conan': 100, 'Kaitu kid': 90, 'Mouri Ran': 70}
I tried this code but it's too long and not efficient.
names1 = names[0].split(',')
names2 = names[1].split(',')
names3 = names[2].split(',')
names_dict = {}
names_dict[names1[0] + ' ' + names1[1]]= int(names1[2])
names_dict[names2[0] + ' ' + names2[1]]= int(names2[2])
names_dict[names3[0] + ' ' + names3[1]]= int(names3[2])
>>> dict((x + y, int(z)) for (x, y, z) in (w.split(',') for w in ['Edogawa, Conan, 100', 'Kaitu, kid, 90', 'Mouri, Ran, 70']))
{'Mouri Ran': 70, 'Edogawa Conan': 100, 'Kaitu kid': 90}
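An equivalent form of the same idea, written as a dict comprehension for readability (a sketch over the same names list):

names = ['Edogawa, Conan, 100', 'Kaitu, kid, 90', 'Mouri, Ran, 70']
# Split each entry on ', ' so the pieces carry no stray spaces, then join the
# first two fields as the key and convert the last one to an int.
names_dict = {f"{first} {last}": int(score)
              for first, last, score in (entry.split(', ') for entry in names)}
print(names_dict)  # {'Edogawa Conan': 100, 'Kaitu kid': 90, 'Mouri Ran': 70}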